├── .github
│   └── ISSUE_TEMPLATE
│       ├── bug_report.md
│       └── question-about-the-book.md
├── .gitignore
├── 01_introduction.ipynb
├── 02_classification.ipynb
├── 03_transformer-anatomy.ipynb
├── 04_multilingual-ner.ipynb
├── 05_text-generation.ipynb
├── 06_summarization.ipynb
├── 07_question-answering.ipynb
├── 07_question_answering_v2.ipynb
├── 08_model-compression.ipynb
├── 09_few-to-no-labels.ipynb
├── 10_transformers-from-scratch.ipynb
├── 11_future-directions.ipynb
├── LICENSE
├── README.md
├── SageMaker
│   ├── 01_introduction.ipynb
│   ├── 02_classification.ipynb
│   ├── README.md
│   ├── images
│   │   ├── git_repo.png
│   │   ├── iam_role.png
│   │   └── notebook_config.png
│   ├── scripts
│   │   └── 02_classification_train.py
│   └── utils.py
├── data
│   └── github-issues-transformers.jsonl
├── environment-chapter7.yml
├── environment.yml
├── images
│   ├── book_cover.jpg
│   ├── chapter01_enc-dec-attn.png
│   ├── chapter01_enc-dec.png
│   ├── chapter01_hf-ecosystem.png
│   ├── chapter01_hub-model-card.png
│   ├── chapter01_hub-overview.png
│   ├── chapter01_rnn.png
│   ├── chapter01_self-attention.png
│   ├── chapter01_timeline.png
│   ├── chapter01_transfer-learning.png
│   ├── chapter01_ulmfit.png
│   ├── chapter02_attention-alignment.png
│   ├── chapter02_attention-mask.png
│   ├── chapter02_encoder-classifier.png
│   ├── chapter02_encoder-feature-based.png
│   ├── chapter02_encoder-fine-tuning.png
│   ├── chapter02_hf-libraries.png
│   ├── chapter02_transformers-compact.html
│   ├── chapter02_transformers.html
│   ├── chapter02_transformers.png
│   ├── chapter02_tweet.png
│   ├── chapter03_attention-ops.png
│   ├── chapter03_contextualized-embedding.png
│   ├── chapter03_decoder-zoom.png
│   ├── chapter03_encoder-zoom.png
│   ├── chapter03_layer-norm.png
│   ├── chapter03_multihead-attention.png
│   ├── chapter03_transformer-encoder-decoder.png
│   ├── chapter03_transformers-compact.png
│   ├── chapter04_bert-body-head.png
│   ├── chapter04_clf-architecture.png
│   ├── chapter04_ner-architecture.png
│   ├── chapter04_ner-widget.png
│   ├── chapter04_tokenizer-pipeline.png
│   ├── chapter05_beam-search.png
│   ├── chapter05_lm-meta-learning.png
│   ├── chapter05_meena.png
│   ├── chapter05_text-generation.png
│   ├── chapter07_dpr.png
│   ├── chapter07_marie-curie.png
│   ├── chapter07_phone.png
│   ├── chapter07_qa-head.png
│   ├── chapter07_qa-pyramid.png
│   ├── chapter07_rag-architecture.png
│   ├── chapter07_retriever-reader.png
│   ├── chapter07_sliding-window.png
│   ├── chapter07_squad-models.png
│   ├── chapter07_squad-schema.png
│   ├── chapter07_squad-sota.png
│   ├── chapter08_bert-onnx.png
│   ├── chapter08_fp32-to-int8.png
│   ├── chapter08_kd.png
│   ├── chapter08_magnitude-vs-movement.png
│   ├── chapter08_network-pruning.png
│   ├── chapter08_onnx-ort.png
│   ├── chapter08_oos.png
│   ├── chapter08_pegasus.png
│   ├── chapter08_pruning-dists.png
│   ├── chapter08_roblox.png
│   ├── chapter08_soft-probs.png
│   ├── chapter08_t5.png
│   ├── chapter09_decision-tree.png
│   ├── chapter09_faiss-index.png
│   ├── chapter09_issue-example-v2.png
│   ├── chapter09_nearest-neighbours.png
│   ├── chapter09_uda.png
│   ├── chapter09_ust.png
│   ├── chapter10_code-snippet.png
│   ├── chapter10_ddp.png
│   ├── chapter10_preprocessing-clm.png
│   ├── chapter10_pretraining-clm.png
│   ├── chapter10_pretraining-mlm.png
│   ├── chapter10_pretraining-seq2seq.png
│   ├── chapter11_atomic-sparse-attention.png
│   ├── chapter11_clip-arch.png
│   ├── chapter11_compound-sparse-attention.png
│   ├── chapter11_dall-e.png
│   ├── chapter11_efficient-attention.png
│   ├── chapter11_iGPT.png
│   ├── chapter11_layoutlm.png
│   ├── chapter11_linear-attention.png
│   ├── chapter11_scaling-modal.png
│   ├── chapter11_scaling.png
│   ├── chapter11_table-qa.png
│   ├── chapter11_tapas-architecture.png
│   ├── chapter11_vit-architecture.png
│   ├── chapter11_vqa.png
│   ├── chapter11_wav2vec-u.png
│   ├── chapter11_wav2vec2.png
│   ├── doge.jpg
│   └── optimusprime.jpg
├── install.py
├── plotting.mplstyle
├── requirements-chapter7-v2.txt
├── requirements-chapter7.txt
├── requirements.txt
├── scripts
│   └── create_notebook_table.py
├── settings.ini
└── utils.py
/.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Submit a report to help us improve the book 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Information 11 | 12 | The problem arises in chapter: 13 | 14 | * [ ] Introduction 15 | * [ ] Text Classification 16 | * [ ] Transformer Anatomy 17 | * [ ] Multilingual Named Entity Recognition 18 | * [ ] Text Generation 19 | * [ ] Summarization 20 | * [ ] Question Answering 21 | * [ ] Making Transformers Efficient in Production 22 | * [ ] Dealing with Few to No Labels 23 | * [ ] Training Transformers from Scratch 24 | * [ ] Future Directions 25 | 26 | 27 | ## Describe the bug 28 | 29 | 30 | ## To Reproduce 31 | Steps to reproduce the behavior: 32 | 33 | 1. 34 | 2. 35 | 3. 36 | 37 | 40 | 41 | ## Expected behavior 42 | 43 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question-about-the-book.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question or comment about the book 3 | about: Have a general question or comment about the book content? Ask it here! 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Information 11 | 12 | The question or comment is about chapter: 13 | 14 | * [ ] Introduction 15 | * [ ] Text Classification 16 | * [ ] Transformer Anatomy 17 | * [ ] Multilingual Named Entity Recognition 18 | * [ ] Text Generation 19 | * [ ] Summarization 20 | * [ ] Question Answering 21 | * [ ] Making Transformers Efficient in Production 22 | * [ ] Dealing with Few to No Labels 23 | * [ ] Training Transformers from Scratch 24 | * [ ] Future Directions 25 | 26 | ## Question or comment -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # mac 132 | .DS_Store 133 | -------------------------------------------------------------------------------- /01_introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Uncomment and run this cell if you're on Colab or Kaggle\n", 10 | "# !git clone https://github.com/nlp-with-transformers/notebooks.git\n", 11 | "# %cd notebooks\n", 12 | "# from install import *\n", 13 | "# install_requirements()" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "#hide\n", 23 | "from utils import *\n", 24 | "setup_chapter()" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "# Hello Transformers" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "\"transformer-timeline\"" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## The Encoder-Decoder Framework" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "\"rnn\"" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "\"enc-dec\"" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## Attention Mechanisms" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "\"enc-dec-attn\" " 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | 
"\"attention-alignment\" " 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "\"transformer-self-attn\" " 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "## Transfer Learning in NLP" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "\"transfer-learning\" " 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "\"ulmfit\"" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "## Hugging Face Transformers: Bridging the Gap" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "## A Tour of Transformer Applications" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "text = \"\"\"Dear Amazon, last week I ordered an Optimus Prime action figure \\\n", 132 | "from your online store in Germany. Unfortunately, when I opened the package, \\\n", 133 | "I discovered to my horror that I had been sent an action figure of Megatron \\\n", 134 | "instead! As a lifelong enemy of the Decepticons, I hope you can understand my \\\n", 135 | "dilemma. To resolve the issue, I demand an exchange of Megatron for the \\\n", 136 | "Optimus Prime figure I ordered. Enclosed are copies of my records concerning \\\n", 137 | "this purchase. I expect to hear from you soon. Sincerely, Bumblebee.\"\"\"" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "### Text Classification" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "#hide_output\n", 154 | "from transformers import pipeline\n", 155 | "\n", 156 | "classifier = pipeline(\"text-classification\")" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/html": [ 167 | "
\n", 168 | "\n", 181 | "\n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | "
labelscore
0NEGATIVE0.901546
\n", 197 | "
" 198 | ], 199 | "text/plain": [ 200 | " label score\n", 201 | "0 NEGATIVE 0.901546" 202 | ] 203 | }, 204 | "execution_count": null, 205 | "metadata": {}, 206 | "output_type": "execute_result" 207 | } 208 | ], 209 | "source": [ 210 | "import pandas as pd\n", 211 | "\n", 212 | "outputs = classifier(text)\n", 213 | "pd.DataFrame(outputs) " 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "### Named Entity Recognition" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "data": { 230 | "text/html": [ 231 | "
\n", 232 | "\n", 245 | "\n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | "
entity_groupscorewordstartend
0ORG0.879010Amazon511
1MISC0.990859Optimus Prime3649
2LOC0.999755Germany9097
3MISC0.556569Mega208212
4PER0.590256##tron212216
5ORG0.669692Decept253259
6MISC0.498350##icons259264
7MISC0.775361Megatron350358
8MISC0.987854Optimus Prime367380
9PER0.812096Bumblebee502511
\n", 339 | "
" 340 | ], 341 | "text/plain": [ 342 | " entity_group score word start end\n", 343 | "0 ORG 0.879010 Amazon 5 11\n", 344 | "1 MISC 0.990859 Optimus Prime 36 49\n", 345 | "2 LOC 0.999755 Germany 90 97\n", 346 | "3 MISC 0.556569 Mega 208 212\n", 347 | "4 PER 0.590256 ##tron 212 216\n", 348 | "5 ORG 0.669692 Decept 253 259\n", 349 | "6 MISC 0.498350 ##icons 259 264\n", 350 | "7 MISC 0.775361 Megatron 350 358\n", 351 | "8 MISC 0.987854 Optimus Prime 367 380\n", 352 | "9 PER 0.812096 Bumblebee 502 511" 353 | ] 354 | }, 355 | "execution_count": null, 356 | "metadata": {}, 357 | "output_type": "execute_result" 358 | } 359 | ], 360 | "source": [ 361 | "ner_tagger = pipeline(\"ner\", aggregation_strategy=\"simple\")\n", 362 | "outputs = ner_tagger(text)\n", 363 | "pd.DataFrame(outputs) " 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "### Question Answering " 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": {}, 377 | "outputs": [ 378 | { 379 | "data": { 380 | "text/html": [ 381 | "
\n", 382 | "\n", 395 | "\n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | "
scorestartendanswer
00.631291335358an exchange of Megatron
\n", 415 | "
" 416 | ], 417 | "text/plain": [ 418 | " score start end answer\n", 419 | "0 0.631291 335 358 an exchange of Megatron" 420 | ] 421 | }, 422 | "execution_count": null, 423 | "metadata": {}, 424 | "output_type": "execute_result" 425 | } 426 | ], 427 | "source": [ 428 | "reader = pipeline(\"question-answering\")\n", 429 | "question = \"What does the customer want?\"\n", 430 | "outputs = reader(question=question, context=text)\n", 431 | "pd.DataFrame([outputs]) " 432 | ] 433 | }, 434 | { 435 | "cell_type": "markdown", 436 | "metadata": {}, 437 | "source": [ 438 | "### Summarization" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": {}, 445 | "outputs": [ 446 | { 447 | "name": "stdout", 448 | "output_type": "stream", 449 | "text": [ 450 | " Bumblebee ordered an Optimus Prime action figure from your online store in\n", 451 | "Germany. Unfortunately, when I opened the package, I discovered to my horror\n", 452 | "that I had been sent an action figure of Megatron instead.\n" 453 | ] 454 | } 455 | ], 456 | "source": [ 457 | "summarizer = pipeline(\"summarization\")\n", 458 | "outputs = summarizer(text, max_length=45, clean_up_tokenization_spaces=True)\n", 459 | "print(outputs[0]['summary_text'])" 460 | ] 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "metadata": {}, 465 | "source": [ 466 | "### Translation" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": {}, 473 | "outputs": [ 474 | { 475 | "name": "stdout", 476 | "output_type": "stream", 477 | "text": [ 478 | "Sehr geehrter Amazon, letzte Woche habe ich eine Optimus Prime Action Figur aus\n", 479 | "Ihrem Online-Shop in Deutschland bestellt. Leider, als ich das Paket öffnete,\n", 480 | "entdeckte ich zu meinem Entsetzen, dass ich stattdessen eine Action Figur von\n", 481 | "Megatron geschickt worden war! Als lebenslanger Feind der Decepticons, Ich\n", 482 | "hoffe, Sie können mein Dilemma verstehen. Um das Problem zu lösen, Ich fordere\n", 483 | "einen Austausch von Megatron für die Optimus Prime Figur habe ich bestellt.\n", 484 | "Anbei sind Kopien meiner Aufzeichnungen über diesen Kauf. Ich erwarte, bald von\n", 485 | "Ihnen zu hören. Aufrichtig, Bumblebee.\n" 486 | ] 487 | } 488 | ], 489 | "source": [ 490 | "translator = pipeline(\"translation_en_to_de\", \n", 491 | " model=\"Helsinki-NLP/opus-mt-en-de\")\n", 492 | "outputs = translator(text, clean_up_tokenization_spaces=True, min_length=100)\n", 493 | "print(outputs[0]['translation_text'])" 494 | ] 495 | }, 496 | { 497 | "cell_type": "markdown", 498 | "metadata": {}, 499 | "source": [ 500 | "### Text Generation" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "#hide\n", 510 | "from transformers import set_seed\n", 511 | "set_seed(42) # Set the seed to get reproducible results" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": {}, 518 | "outputs": [ 519 | { 520 | "name": "stdout", 521 | "output_type": "stream", 522 | "text": [ 523 | "Dear Amazon, last week I ordered an Optimus Prime action figure from your online\n", 524 | "store in Germany. Unfortunately, when I opened the package, I discovered to my\n", 525 | "horror that I had been sent an action figure of Megatron instead! As a lifelong\n", 526 | "enemy of the Decepticons, I hope you can understand my dilemma. 
To resolve the\n", 527 | "issue, I demand an exchange of Megatron for the Optimus Prime figure I ordered.\n", 528 | "Enclosed are copies of my records concerning this purchase. I expect to hear\n", 529 | "from you soon. Sincerely, Bumblebee.\n", 530 | "\n", 531 | "Customer service response:\n", 532 | "Dear Bumblebee, I am sorry to hear that your order was mixed up. The order was\n", 533 | "completely mislabeled, which is very common in our online store, but I can\n", 534 | "appreciate it because it was my understanding from this site and our customer\n", 535 | "service of the previous day that your order was not made correct in our mind and\n", 536 | "that we are in a process of resolving this matter. We can assure you that your\n", 537 | "order\n" 538 | ] 539 | } 540 | ], 541 | "source": [ 542 | "generator = pipeline(\"text-generation\")\n", 543 | "response = \"Dear Bumblebee, I am sorry to hear that your order was mixed up.\"\n", 544 | "prompt = text + \"\\n\\nCustomer service response:\\n\" + response\n", 545 | "outputs = generator(prompt, max_length=200)\n", 546 | "print(outputs[0]['generated_text'])" 547 | ] 548 | }, 549 | { 550 | "cell_type": "markdown", 551 | "metadata": {}, 552 | "source": [ 553 | "## The Hugging Face Ecosystem" 554 | ] 555 | }, 556 | { 557 | "cell_type": "markdown", 558 | "metadata": {}, 559 | "source": [ 560 | "\"ecosystem\"" 561 | ] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "metadata": {}, 566 | "source": [ 567 | "### The Hugging Face Hub" 568 | ] 569 | }, 570 | { 571 | "cell_type": "markdown", 572 | "metadata": {}, 573 | "source": [ 574 | "\"hub-overview\" " 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "metadata": {}, 580 | "source": [ 581 | "\"hub-model-card\" " 582 | ] 583 | }, 584 | { 585 | "cell_type": "markdown", 586 | "metadata": {}, 587 | "source": [ 588 | "### Hugging Face Tokenizers" 589 | ] 590 | }, 591 | { 592 | "cell_type": "markdown", 593 | "metadata": {}, 594 | "source": [ 595 | "### Hugging Face Datasets" 596 | ] 597 | }, 598 | { 599 | "cell_type": "markdown", 600 | "metadata": {}, 601 | "source": [ 602 | "### Hugging Face Accelerate" 603 | ] 604 | }, 605 | { 606 | "cell_type": "markdown", 607 | "metadata": {}, 608 | "source": [ 609 | "## Main Challenges with Transformers" 610 | ] 611 | }, 612 | { 613 | "cell_type": "markdown", 614 | "metadata": {}, 615 | "source": [ 616 | "## Conclusion" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": null, 622 | "metadata": {}, 623 | "outputs": [], 624 | "source": [] 625 | } 626 | ], 627 | "metadata": { 628 | "kernelspec": { 629 | "display_name": "Python 3 (ipykernel)", 630 | "language": "python", 631 | "name": "python3" 632 | } 633 | }, 634 | "nbformat": 4, 635 | "nbformat_minor": 4 636 | } 637 | -------------------------------------------------------------------------------- /10_transformers-from-scratch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Uncomment and run this cell if you're on Colab or Kaggle\n", 10 | "# !git clone https://github.com/nlp-with-transformers/notebooks.git\n", 11 | "# %cd notebooks\n", 12 | "# from install import *\n", 13 | "# install_requirements(is_chapter10=True)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "# hide\n", 23 | "from utils import 
*\n", 24 | "setup_chapter()" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "# Training Transformers from Scratch" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "> **Note:** In this chapter a large dataset and the script to train a large language model on a distributed infrastructure are built. As such not all the steps in this notebook are executable on platforms such as Colab or Kaggle. Either downscale the steps at critical points or use this notebook as an inspiration when building a script for distributed training." 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## Large Datasets and Where to Find Them" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Challenges of Building a Large-Scale Corpus" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "#hide_output\n", 62 | "from transformers import pipeline, set_seed\n", 63 | "\n", 64 | "generation_gpt = pipeline(\"text-generation\", model=\"openai-gpt\")\n", 65 | "generation_gpt2 = pipeline(\"text-generation\", model=\"gpt2\")" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "GPT size: 116.5M parameters\n", 78 | "GPT2 size: 124.4M parameters\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "def model_size(model):\n", 84 | " return sum(t.numel() for t in model.parameters())\n", 85 | "\n", 86 | "print(f\"GPT size: {model_size(generation_gpt.model)/1000**2:.1f}M parameters\")\n", 87 | "print(f\"GPT2 size: {model_size(generation_gpt2.model)/1000**2:.1f}M parameters\")" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "# hide\n", 97 | "set_seed(1)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "GPT completions:\n", 110 | "1.\n", 111 | "When they came back.\n", 112 | " \" we need all we can get, \" jason said once they had settled into the back of\n", 113 | "the truck without anyone stopping them. \" after getting out here, it 'll be up\n", 114 | "to us what to find. for now\n", 115 | "2.\n", 116 | "When they came back.\n", 117 | " his gaze swept over her body. he 'd dressed her, too, in the borrowed clothes\n", 118 | "that she 'd worn for the journey.\n", 119 | " \" i thought it would be easier to just leave you there. \" a woman like\n", 120 | "3.\n", 121 | "When they came back to the house and she was sitting there with the little boy.\n", 122 | " \" don't be afraid, \" he told her. she nodded slowly, her eyes wide. she was so\n", 123 | "lost in whatever she discovered that tom knew her mistake\n", 124 | "\n", 125 | "GPT-2 completions:\n", 126 | "1.\n", 127 | "When they came back we had a big dinner and the other guys went to see what\n", 128 | "their opinion was on her. I did an hour and they were happy with it.\n", 129 | "2.\n", 130 | "When they came back to this island there had been another massacre, but he could\n", 131 | "not help but feel pity for the helpless victim who had been left to die, and\n", 132 | "that they had failed that day. 
And so was very, very grateful indeed.\n", 133 | "3.\n", 134 | "When they came back to our house after the morning, I asked if she was sure. She\n", 135 | "said, \"Nope.\" The two kids were gone that morning. I thought they were back to\n", 136 | "being a good friend.\n", 137 | "\n", 138 | "When Dost\n" 139 | ] 140 | } 141 | ], 142 | "source": [ 143 | "def enum_pipeline_outputs(pipe, prompt, num_return_sequences):\n", 144 | "    out = pipe(prompt, num_return_sequences=num_return_sequences,\n", 145 | "               clean_up_tokenization_spaces=True)\n", 146 | "    return \"\\n\".join(f\"{i+1}.\" + s[\"generated_text\"] for i, s in enumerate(out))\n", 147 | "\n", 148 | "prompt = \"\\nWhen they came back\"\n", 149 | "print(\"GPT completions:\\n\" + enum_pipeline_outputs(generation_gpt, prompt, 3))\n", 150 | "print(\"\")\n", 151 | "print(\"GPT-2 completions:\\n\" + enum_pipeline_outputs(generation_gpt2, prompt, 3))" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "### Building a Custom Code Dataset\n" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "#### Creating a dataset with Google BigQuery" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "#sidebar To Filter the Noise or Not?" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "### Working with Large Datasets" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "#### Memory mapping" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "> **Note:** The following code block assumes that you have downloaded the BigQuery dataset to a folder called `codeparrot`. We suggest skipping this step since it will unpack the compressed files and require ~180GB of disk space. This code is for demonstration purposes only; you can simply continue below with the streamed dataset, which will not consume that much disk space."
194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "#hide_output\n", 203 | "from datasets import load_dataset, DownloadConfig\n", 204 | "\n", 205 | "download_config = DownloadConfig(delete_extracted=True)\n", 206 | "dataset = load_dataset(\"./codeparrot\", split=\"train\",\n", 207 | " download_config=download_config)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "Number of python files code in dataset : 18695559\n", 220 | "Dataset size (cache file) : 183.68 GB\n", 221 | "RAM memory used: 4924 MB\n" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "import psutil, os\n", 227 | "\n", 228 | "print(f\"Number of python files code in dataset : {len(dataset)}\")\n", 229 | "ds_size = sum(os.stat(f[\"filename\"]).st_size for f in dataset.cache_files)\n", 230 | "# os.stat.st_size is expressed in bytes, so we convert to GB\n", 231 | "print(f\"Dataset size (cache file) : {ds_size / 2**30:.2f} GB\")\n", 232 | "# Process.memory_info is expressed in bytes, so we convert to MB\n", 233 | "print(f\"RAM used: {psutil.Process(os.getpid()).memory_info().rss >> 20} MB\")" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "#### Streaming" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "name": "stderr", 250 | "output_type": "stream", 251 | "text": [ 252 | "Using custom data configuration default-cae7a1d2f0dbde67\n" 253 | ] 254 | } 255 | ], 256 | "source": [ 257 | "# hide_output\n", 258 | "streamed_dataset = load_dataset('./codeparrot', split=\"train\", streaming=True)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "name": "stdout", 268 | "output_type": "stream", 269 | "text": [ 270 | "True\n", 271 | "True\n" 272 | ] 273 | } 274 | ], 275 | "source": [ 276 | "iterator = iter(streamed_dataset)\n", 277 | "\n", 278 | "print(dataset[0] == next(iterator))\n", 279 | "print(dataset[1] == next(iterator))" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "remote_dataset = load_dataset('transformersbook/codeparrot', split=\"train\",\n", 289 | " streaming=True)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "### Adding Datasets to the Hugging Face Hub" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "## Building a Tokenizer" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [ 311 | { 312 | "data": { 313 | "application/vnd.jupyter.widget-view+json": { 314 | "model_id": "29ced71e91434126970160a03cc006a5", 315 | "version_major": 2, 316 | "version_minor": 0 317 | }, 318 | "text/plain": [ 319 | "Downloading: 0%| | 0.00/1.17k [00:00\n", 570 | " \n", 571 | " \n", 572 | " Description\n", 573 | " Character\n", 574 | " Bytes\n", 575 | " Mapped bytes\n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " Regular characters\n", 581 | " `a` and `?`\n", 582 | " 97 and 63\n", 583 | " `a` and `?`\n", 584 | " \n", 585 | " \n", 586 | " Non-printable control character 
(CARRIAGE RETURN)\n", 587 | " `U+000D`\n", 588 | " 13\n", 589 | " `č`\n", 590 | " \n", 591 | " \n", 592 | " A space\n", 593 | " ` `\n", 594 | " 32\n", 595 | " `Ġ`\n", 596 | " \n", 597 | " \n", 598 | " A non-breakable space\n", 599 | " `\\xa0`\n", 600 | " 160\n", 601 | " `ł`\n", 602 | " \n", 603 | " \n", 604 | " A newline character\n", 605 | " `\\n`\n", 606 | " 10\n", 607 | " `Ċ`\n", 608 | " \n", 609 | " \n", 610 | "" 611 | ], 612 | "text/plain": [ 613 | "" 614 | ] 615 | }, 616 | "metadata": {}, 617 | "output_type": "display_data" 618 | } 619 | ], 620 | "source": [ 621 | "# hide_input\n", 622 | "#id unicode_mapping\n", 623 | "#caption Examples of character mappings in BPE\n", 624 | "#hide_input\n", 625 | "import pandas as pd\n", 626 | "from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode\n", 627 | "\n", 628 | "byte_to_unicode_map = bytes_to_unicode()\n", 629 | "unicode_to_byte_map = dict((v, k) for k, v in byte_to_unicode_map.items())\n", 630 | "base_vocab = list(unicode_to_byte_map.keys())\n", 631 | "\n", 632 | "examples = [\n", 633 | " ['Regular characters', '`a` and `?`', f'{ord(\"a\")} and {ord(\"?\")}' , f'`{byte_to_unicode_map[ord(\"a\")]}` and `{byte_to_unicode_map[ord(\"?\")]}`'],\n", 634 | " ['Nonprintable control character (carriage return)', '`U+000D`', f'13', f'`{byte_to_unicode_map[13]}`'],\n", 635 | " ['A space', '` `', f'{ord(\" \")}', f'`{byte_to_unicode_map[ord(\" \")]}`'],\n", 636 | " ['A nonbreakable space', '`\\\\xa0`', '160', f'`{byte_to_unicode_map[ord(chr(160))]}`'],\n", 637 | " ['A newline character', '`\\\\n`', '10', f'`{byte_to_unicode_map[ord(chr(10))]}`'],\n", 638 | "]\n", 639 | "\n", 640 | "pd.DataFrame(examples, columns = ['Description', 'Character', 'Bytes', 'Mapped bytes'])" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": null, 646 | "metadata": {}, 647 | "outputs": [ 648 | { 649 | "name": "stdout", 650 | "output_type": "stream", 651 | "text": [ 652 | "[('def', (0, 3)), ('Ġsay', (3, 7)), ('_', (7, 8)), ('hello', (8, 13)), ('():',\n", 653 | "(13, 16)), ('ĊĠĠĠ', (16, 20)), ('Ġprint', (20, 26)), ('(\"', (26, 28)), ('Hello',\n", 654 | "(28, 33)), (',', (33, 34)), ('ĠWorld', (34, 40)), ('!\")', (40, 43)), ('Ġ#', (43,\n", 655 | "45)), ('ĠPrint', (45, 51)), ('Ġit', (51, 54)), ('Ċ', (54, 55)), ('Ċ', (55, 56)),\n", 656 | "('say', (56, 59)), ('_', (59, 60)), ('hello', (60, 65)), ('()', (65, 67)), ('Ċ',\n", 657 | "(67, 68))]\n" 658 | ] 659 | } 660 | ], 661 | "source": [ 662 | "print(tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(python_code))" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": null, 668 | "metadata": {}, 669 | "outputs": [ 670 | { 671 | "name": "stdout", 672 | "output_type": "stream", 673 | "text": [ 674 | "Size of the vocabulary: 50257\n" 675 | ] 676 | } 677 | ], 678 | "source": [ 679 | "print(f\"Size of the vocabulary: {len(tokenizer)}\")" 680 | ] 681 | }, 682 | { 683 | "cell_type": "code", 684 | "execution_count": null, 685 | "metadata": {}, 686 | "outputs": [ 687 | { 688 | "name": "stdout", 689 | "output_type": "stream", 690 | "text": [ 691 | "['def', 'Ġsay', '_', 'hello', '():', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġprint', '(\"',\n", 692 | "'Hello', ',', 'ĠWorld', '!\"', ')', 'Ġ#', 'ĠPrint', 'Ġit', 'Ċ', 'Ċ', 'say', '_',\n", 693 | "'hello', '()', 'Ċ']\n" 694 | ] 695 | } 696 | ], 697 | "source": [ 698 | "print(tokenizer(python_code).tokens())" 699 | ] 700 | }, 701 | { 702 | "cell_type": "markdown", 703 | "metadata": {}, 704 | "source": [ 705 | "### Training a Tokenizer" 706 | ] 
707 | }, 708 | { 709 | "cell_type": "code", 710 | "execution_count": null, 711 | "metadata": {}, 712 | "outputs": [ 713 | { 714 | "name": "stdout", 715 | "output_type": "stream", 716 | "text": [ 717 | "['ÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂ', '\n", 718 | "=================================================================', '\n", 719 | "----------------------------------------------------------------',\n", 720 | "'................................................................',\n", 721 | "'ÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂ',\n", 722 | "'----------------------------------------------------------------',\n", 723 | "'================================================================',\n", 724 | "'________________________________________________________________']\n" 725 | ] 726 | } 727 | ], 728 | "source": [ 729 | "tokens = sorted(tokenizer.vocab.items(), key=lambda x: len(x[0]), reverse=True)\n", 730 | "print([f'{tokenizer.convert_tokens_to_string(t)}' for t, _ in tokens[:8]]);" 731 | ] 732 | }, 733 | { 734 | "cell_type": "code", 735 | "execution_count": null, 736 | "metadata": {}, 737 | "outputs": [ 738 | { 739 | "name": "stdout", 740 | "output_type": "stream", 741 | "text": [ 742 | "['<|endoftext|>', ' gazed', ' informants', ' Collider', ' regress', 'ominated',\n", 743 | "' amplification', 'Compar', '….\"', ' (/', 'Commission', ' Hitman']\n" 744 | ] 745 | } 746 | ], 747 | "source": [ 748 | "tokens = sorted(tokenizer.vocab.items(), key=lambda x: x[1], reverse=True)\n", 749 | "print([f'{tokenizer.convert_tokens_to_string(t)}' for t, _ in tokens[:12]]);" 750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": null, 755 | "metadata": {}, 756 | "outputs": [ 757 | { 758 | "data": { 759 | "application/vnd.jupyter.widget-view+json": { 760 | "model_id": "743bca69d71649908db9ca5760af61d2", 761 | "version_major": 2, 762 | "version_minor": 0 763 | }, 764 | "text/plain": [ 765 | "Check remote data files: 0%| | 0/183 [00:00" 1061 | ] 1062 | }, 1063 | { 1064 | "cell_type": "markdown", 1065 | "metadata": {}, 1066 | "source": [ 1067 | "#### Causal language modeling" 1068 | ] 1069 | }, 1070 | { 1071 | "cell_type": "markdown", 1072 | "metadata": {}, 1073 | "source": [ 1074 | "\"CLM" 1075 | ] 1076 | }, 1077 | { 1078 | "cell_type": "markdown", 1079 | "metadata": {}, 1080 | "source": [ 1081 | "#### Masked language modeling" 1082 | ] 1083 | }, 1084 | { 1085 | "cell_type": "markdown", 1086 | "metadata": {}, 1087 | "source": [ 1088 | "\"MLM" 1089 | ] 1090 | }, 1091 | { 1092 | "cell_type": "markdown", 1093 | "metadata": {}, 1094 | "source": [ 1095 | "#### Sequence-to-sequence training" 1096 | ] 1097 | }, 1098 | { 1099 | "cell_type": "markdown", 1100 | "metadata": {}, 1101 | "source": [ 1102 | "\"Seq2seq" 1103 | ] 1104 | }, 1105 | { 1106 | "cell_type": "markdown", 1107 | "metadata": {}, 1108 | "source": [ 1109 | "### Initializing the Model" 1110 | ] 1111 | }, 1112 | { 1113 | "cell_type": "markdown", 1114 | "metadata": {}, 1115 | "source": [ 1116 | "> **NOTE**: In the following code block, a large GPT-2 checkpoint is loaded into memory. On platforms like Colab and Kaggle, this can cause the instance to crash due to insufficient RAM or GPU memory. You can still run the example if you use the small checkpoint by replacing the configuration with `config = AutoConfig.from_pretrained(\"gpt2\", vocab_size=len(tokenizer))`." 
1117 | ] 1118 | }, 1119 | { 1120 | "cell_type": "code", 1121 | "execution_count": null, 1122 | "metadata": {}, 1123 | "outputs": [ 1124 | { 1125 | "data": { 1126 | "application/vnd.jupyter.widget-view+json": { 1127 | "model_id": "be84ca77ca144954af8ae4820ec6685b", 1128 | "version_major": 2, 1129 | "version_minor": 0 1130 | }, 1131 | "text/plain": [ 1132 | "Downloading: 0%| | 0.00/787 [00:00" 1235 | ] 1236 | }, 1237 | { 1238 | "cell_type": "code", 1239 | "execution_count": null, 1240 | "metadata": {}, 1241 | "outputs": [ 1242 | { 1243 | "name": "stderr", 1244 | "output_type": "stream", 1245 | "text": [ 1246 | " 0%| | 1/500 [00:00<01:16, 6.54it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2605 > 1024). Running this sequence through the model will result in indexing errors\n", 1247 | "100%|██████████| 500/500 [00:04<00:00, 122.59it/s]\n" 1248 | ] 1249 | } 1250 | ], 1251 | "source": [ 1252 | "#hide_output\n", 1253 | "examples, total_characters, total_tokens = 500, 0, 0\n", 1254 | "dataset = load_dataset('transformersbook/codeparrot-train', split='train',\n", 1255 | " streaming=True)\n", 1256 | "\n", 1257 | "for _, example in tqdm(zip(range(examples), iter(dataset)), total=examples):\n", 1258 | " total_characters += len(example['content'])\n", 1259 | " total_tokens += len(tokenizer(example['content']).tokens())\n", 1260 | "\n", 1261 | "characters_per_token = total_characters / total_tokens" 1262 | ] 1263 | }, 1264 | { 1265 | "cell_type": "code", 1266 | "execution_count": null, 1267 | "metadata": {}, 1268 | "outputs": [ 1269 | { 1270 | "name": "stdout", 1271 | "output_type": "stream", 1272 | "text": [ 1273 | "3.6233025034779565\n" 1274 | ] 1275 | } 1276 | ], 1277 | "source": [ 1278 | "print(characters_per_token)" 1279 | ] 1280 | }, 1281 | { 1282 | "cell_type": "code", 1283 | "execution_count": null, 1284 | "metadata": {}, 1285 | "outputs": [], 1286 | "source": [ 1287 | "import torch\n", 1288 | "from torch.utils.data import IterableDataset\n", 1289 | "\n", 1290 | "class ConstantLengthDataset(IterableDataset):\n", 1291 | " \n", 1292 | " def __init__(self, tokenizer, dataset, seq_length=1024,\n", 1293 | " num_of_sequences=1024, chars_per_token=3.6):\n", 1294 | " self.tokenizer = tokenizer\n", 1295 | " self.concat_token_id = tokenizer.eos_token_id\n", 1296 | " self.dataset = dataset\n", 1297 | " self.seq_length = seq_length\n", 1298 | " self.input_characters = seq_length * chars_per_token * num_of_sequences\n", 1299 | " \n", 1300 | " def __iter__(self):\n", 1301 | " iterator = iter(self.dataset)\n", 1302 | " more_examples = True\n", 1303 | " while more_examples:\n", 1304 | " buffer, buffer_len = [], 0\n", 1305 | " while True:\n", 1306 | " if buffer_len >= self.input_characters:\n", 1307 | " m=f\"Buffer full: {buffer_len}>={self.input_characters:.0f}\"\n", 1308 | " print(m)\n", 1309 | " break\n", 1310 | " try:\n", 1311 | " m=f\"Fill buffer: {buffer_len}<{self.input_characters:.0f}\"\n", 1312 | " print(m)\n", 1313 | " buffer.append(next(iterator)[\"content\"])\n", 1314 | " buffer_len += len(buffer[-1])\n", 1315 | " except StopIteration:\n", 1316 | " iterator = iter(self.dataset)\n", 1317 | "\n", 1318 | " all_token_ids = []\n", 1319 | " tokenized_inputs = self.tokenizer(buffer, truncation=False)\n", 1320 | " for tokenized_input in tokenized_inputs['input_ids']:\n", 1321 | " all_token_ids.extend(tokenized_input + [self.concat_token_id])\n", 1322 | " \n", 1323 | " for i in range(0, len(all_token_ids), self.seq_length):\n", 1324 | " input_ids = 
all_token_ids[i : i + self.seq_length]\n", 1325 | "            if len(input_ids) == self.seq_length:\n", 1326 | "                yield torch.tensor(input_ids)" 1327 | ] 1328 | }, 1329 | { 1330 | "cell_type": "code", 1331 | "execution_count": null, 1332 | "metadata": {}, 1333 | "outputs": [ 1334 | { 1335 | "name": "stdout", 1336 | "output_type": "stream", 1337 | "text": [ 1338 | "Fill buffer: 0<36864\n", 1339 | "Fill buffer: 3311<36864\n", 1340 | "Fill buffer: 9590<36864\n", 1341 | "Fill buffer: 22177<36864\n", 1342 | "Fill buffer: 25530<36864\n", 1343 | "Fill buffer: 31098<36864\n", 1344 | "Fill buffer: 32232<36864\n", 1345 | "Fill buffer: 33867<36864\n", 1346 | "Buffer full: 41172>=36864\n", 1347 | "Lengths of the sequences: [1024, 1024, 1024, 1024, 1024]\n" 1348 | ] 1349 | } 1350 | ], 1351 | "source": [ 1352 | "shuffled_dataset = dataset.shuffle(buffer_size=100)\n", 1353 | "constant_length_dataset = ConstantLengthDataset(tokenizer, shuffled_dataset,\n", 1354 | "                                                num_of_sequences=10)\n", 1355 | "dataset_iterator = iter(constant_length_dataset)\n", 1356 | "\n", 1357 | "lengths = [len(b) for _, b in zip(range(5), dataset_iterator)]\n", 1358 | "print(f\"Lengths of the sequences: {lengths}\")" 1359 | ] 1360 | }, 1361 | { 1362 | "cell_type": "markdown", 1363 | "metadata": {}, 1364 | "source": [ 1365 | "### Defining the Training Loop" 1366 | ] 1367 | }, 1368 | { 1369 | "cell_type": "code", 1370 | "execution_count": null, 1371 | "metadata": {}, 1372 | "outputs": [], 1373 | "source": [ 1374 | "from argparse import Namespace\n", 1375 | "\n", 1376 | "# Commented parameters correspond to the small model\n", 1377 | "config = {\"train_batch_size\": 2, # 12\n", 1378 | "          \"valid_batch_size\": 2, # 12\n", 1379 | "          \"weight_decay\": 0.1,\n", 1380 | "          \"shuffle_buffer\": 1000,\n", 1381 | "          \"learning_rate\": 2e-4, # 5e-4\n", 1382 | "          \"lr_scheduler_type\": \"cosine\",\n", 1383 | "          \"num_warmup_steps\": 750, # 2000\n", 1384 | "          \"gradient_accumulation_steps\": 16, # 1\n", 1385 | "          \"max_train_steps\": 50000, # 150000\n", 1386 | "          \"max_eval_steps\": -1,\n", 1387 | "          \"seq_length\": 1024,\n", 1388 | "          \"seed\": 1,\n", 1389 | "          \"save_checkpoint_steps\": 50000} # 15000\n", 1390 | "\n", 1391 | "args = Namespace(**config)" 1392 | ] 1393 | }, 1394 | { 1395 | "cell_type": "code", 1396 | "execution_count": null, 1397 | "metadata": {}, 1398 | "outputs": [], 1399 | "source": [ 1400 | "from torch.utils.tensorboard import SummaryWriter\n", 1401 | "import logging\n", 1402 | "import datasets, transformers, wandb\n", 1403 | "\n", 1404 | "def setup_logging(project_name):\n", 1405 | "    logger = logging.getLogger(__name__)\n", 1406 | "    logging.basicConfig(\n", 1407 | "        format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n", 1408 | "        datefmt=\"%m/%d/%Y %H:%M:%S\", level=logging.INFO, handlers=[\n", 1409 | "        logging.FileHandler(f\"log/debug_{accelerator.process_index}.log\"),\n", 1410 | "        logging.StreamHandler()])\n", 1411 | "    if accelerator.is_main_process: # We only want to set up logging once\n", 1412 | "        wandb.init(project=project_name, config=args)\n", 1413 | "        run_name = wandb.run.name\n", 1414 | "        tb_writer = SummaryWriter()\n", 1415 | "        tb_writer.add_hparams(vars(args), {'0': 0})\n", 1416 | "        logger.setLevel(logging.INFO)\n", 1417 | "        datasets.utils.logging.set_verbosity_debug()\n", 1418 | "        transformers.utils.logging.set_verbosity_info()\n", 1419 | "    else:\n", 1420 | "        tb_writer = None\n", 1421 | "        run_name = ''\n", 1422 | "        logger.setLevel(logging.ERROR)\n", 1423 | "        datasets.utils.logging.set_verbosity_error()\n", 1424 | "        transformers.utils.logging.set_verbosity_error()\n", 1425 | "    return logger, tb_writer, run_name" 1426 | ] 1427 | }, 1428 | { 1429 | "cell_type": "code", 1430 | "execution_count": null, 1431 | "metadata": {}, 1432 | "outputs": [], 1433 | "source": [ 1434 | "def log_metrics(step, metrics):\n", 1435 | "    logger.info(f\"Step {step}: {metrics}\")\n", 1436 | "    if accelerator.is_main_process:\n", 1437 | "        wandb.log(metrics)\n", 1438 | "        [tb_writer.add_scalar(k, v, step) for k, v in metrics.items()]" 1439 | ] 1440 | }, 1441 | { 1442 | "cell_type": "code", 1443 | "execution_count": null, 1444 | "metadata": {}, 1445 | "outputs": [ 1446 | { 1447 | "data": { 1448 | "application/vnd.jupyter.widget-view+json": { 1449 | "model_id": "328dc6d7d05c452e8d8e2cab5b4b9c4e", 1450 | "version_major": 2, 1451 | "version_minor": 0 1452 | }, 1453 | "text/plain": [ 1454 | "Check remote data files: 0%| | 0/183 [00:00 0 and step >= args.max_eval_steps: break\n", 1523 | "    loss = torch.mean(torch.cat(losses))\n", 1524 | "    try:\n", 1525 | "        perplexity = torch.exp(loss)\n", 1526 | "    except OverflowError:\n", 1527 | "        perplexity = torch.tensor(float(\"inf\"))\n", 1528 | "    return loss.item(), perplexity.item()" 1529 | ] 1530 | }, 1531 | { 1532 | "cell_type": "code", 1533 | "execution_count": null, 1534 | "metadata": {}, 1535 | "outputs": [], 1536 | "source": [ 1537 | "set_seed(args.seed)\n", 1538 | "\n", 1539 | "# Accelerator\n", 1540 | "accelerator = Accelerator()\n", 1541 | "samples_per_step = accelerator.state.num_processes * args.train_batch_size\n", 1542 | "\n", 1543 | "# Logging\n", 1544 | "logger, tb_writer, run_name = setup_logging(project_name.split(\"/\")[1])\n", 1545 | "logger.info(accelerator.state)\n", 1546 | "\n", 1547 | "# Load model and tokenizer\n", 1548 | "if accelerator.is_main_process:\n", 1549 | "    hf_repo = Repository(\"./\", clone_from=project_name, revision=run_name)\n", 1550 | "model = AutoModelForCausalLM.from_pretrained(\"./\", gradient_checkpointing=True)\n", 1551 | "tokenizer = AutoTokenizer.from_pretrained(\"./\")\n", 1552 | "\n", 1553 | "# Load dataset and dataloader\n", 1554 | "train_dataloader, eval_dataloader = create_dataloaders(dataset_name)\n", 1555 | "\n", 1556 | "# Prepare the optimizer and learning rate scheduler\n", 1557 | "optimizer = AdamW(get_grouped_params(model), lr=args.learning_rate)\n", 1558 | "lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer,\n", 1559 | "                             num_warmup_steps=args.num_warmup_steps,\n", 1560 | "                             num_training_steps=args.max_train_steps,)\n", 1561 | "def get_lr():\n", 1562 | "    return optimizer.param_groups[0]['lr']\n", 1563 | "\n", 1564 | "# Prepare everything with our `accelerator` (order of args is not important)\n", 1565 | "model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(\n", 1566 | "    model, optimizer, train_dataloader, eval_dataloader)\n", 1567 | "\n", 1568 | "# Train model\n", 1569 | "model.train()\n", 1570 | "completed_steps = 0\n", 1571 | "for step, batch in enumerate(train_dataloader, start=1):\n", 1572 | "    loss = model(batch, labels=batch).loss\n", 1573 | "    log_metrics(step, {'lr': get_lr(), 'samples': step*samples_per_step,\n", 1574 | "                       'steps': completed_steps, 'loss/train': loss.item()})\n", 1575 | "    loss = loss / args.gradient_accumulation_steps\n", 1576 | "    accelerator.backward(loss)\n", 1577 | "    if step % args.gradient_accumulation_steps == 0:\n", 1578 | "        optimizer.step()\n", 1579 | "        lr_scheduler.step()\n", 1580 | "        optimizer.zero_grad()\n", 1581 | "        completed_steps += 1\n", 1582 | "    if 
step % args.save_checkpoint_steps == 0:\n", 1583 | "        logger.info('Evaluating and saving model checkpoint')\n", 1584 | "        eval_loss, perplexity = evaluate()\n", 1585 | "        log_metrics(step, {'loss/eval': eval_loss, 'perplexity': perplexity})\n", 1586 | "        accelerator.wait_for_everyone()\n", 1587 | "        unwrapped_model = accelerator.unwrap_model(model)\n", 1588 | "        if accelerator.is_main_process:\n", 1589 | "            unwrapped_model.save_pretrained(\"./\")\n", 1590 | "            hf_repo.push_to_hub(commit_message=f'step {step}')\n", 1591 | "        model.train()\n", 1592 | "    if completed_steps >= args.max_train_steps:\n", 1593 | "        break\n", 1594 | "\n", 1595 | "# Evaluate and save the last checkpoint\n", 1596 | "logger.info('Evaluating and saving model after training')\n", 1597 | "eval_loss, perplexity = evaluate()\n", 1598 | "log_metrics(step, {'loss/eval': eval_loss, 'perplexity': perplexity})\n", 1599 | "accelerator.wait_for_everyone()\n", 1600 | "unwrapped_model = accelerator.unwrap_model(model)\n", 1601 | "if accelerator.is_main_process:\n", 1602 | "    unwrapped_model.save_pretrained(\"./\")\n", 1603 | "    hf_repo.push_to_hub(commit_message='final model')" 1604 | ] 1605 | }, 1606 | { 1607 | "cell_type": "markdown", 1608 | "metadata": {}, 1609 | "source": [ 1610 | "\"DDP\"" 1611 | ] 1612 | }, 1613 | { 1614 | "cell_type": "markdown", 1615 | "metadata": {}, 1616 | "source": [ 1617 | "### The Training Run" 1618 | ] 1619 | }, 1620 | { 1621 | "cell_type": "markdown", 1622 | "metadata": {}, 1623 | "source": [ 1624 | "## Results and Analysis" 1625 | ] 1626 | }, 1627 | { 1628 | "cell_type": "code", 1629 | "execution_count": null, 1630 | "metadata": {}, 1631 | "outputs": [ 1632 | { 1633 | "name": "stderr", 1634 | "output_type": "stream", 1635 | "text": [ 1636 | "2021-10-20 18:29:01.107727: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n", 1637 | "2021-10-20 18:29:01.107759: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n" 1638 | ] 1639 | } 1640 | ], 1641 | "source": [ 1642 | "#hide_output\n", 1643 | "from transformers import pipeline, set_seed\n", 1644 | "\n", 1645 | "model_ckpt = 'transformersbook/codeparrot-small'\n", 1646 | "generation = pipeline('text-generation', model=model_ckpt, device=0)" 1647 | ] 1648 | }, 1649 | { 1650 | "cell_type": "code", 1651 | "execution_count": null, 1652 | "metadata": {}, 1653 | "outputs": [], 1654 | "source": [ 1655 | "import re\n", 1656 | "from transformers import set_seed \n", 1657 | "\n", 1658 | "def first_block(string):\n", 1659 | "    return re.split('\\\\nclass|\\\\ndef|\\\\n#|\\\\n@|\\\\nprint|\\\\nif', string)[0].rstrip()\n", 1660 | "\n", 1661 | "def complete_code(pipe, prompt, max_length=64, num_completions=4, seed=1):\n", 1662 | "    set_seed(seed)\n", 1663 | "    gen_kwargs = {\"temperature\":0.4, \"top_p\":0.95, \"top_k\":0, \"num_beams\":1,\n", 1664 | "                  \"do_sample\":True,}\n", 1665 | "    code_gens = pipe(prompt, num_return_sequences=num_completions, \n", 1666 | "                     max_length=max_length, **gen_kwargs)\n", 1667 | "    code_strings = []\n", 1668 | "    for code_gen in code_gens:\n", 1669 | "        generated_code = first_block(code_gen['generated_text'][len(prompt):])\n", 1670 | "        code_strings.append(generated_code)\n", 1671 | "    print(('\\\\n'+'='*80 + '\\\\n').join(code_strings))" 1672 | ] 1673 | }, 1674 | { 1675 | "cell_type": "code", 1676 | "execution_count": null, 1677 | 
"metadata": {}, 1678 | "outputs": [ 1679 | { 1680 | "name": "stdout", 1681 | "output_type": "stream", 1682 | "text": [ 1683 | "\n", 1684 | " return math.sqrt(a * b)\n", 1685 | "================================================================================\n", 1686 | "\n", 1687 | " return a * b / 2.0\n", 1688 | "================================================================================\n", 1689 | "\n", 1690 | " return a * b\n", 1691 | "================================================================================\n", 1692 | "\n", 1693 | " return a * b / a\n" 1694 | ] 1695 | } 1696 | ], 1697 | "source": [ 1698 | "prompt = '''def area_of_rectangle(a: float, b: float):\n", 1699 | " \"\"\"Return the area of the rectangle.\"\"\"'''\n", 1700 | "complete_code(generation, prompt)" 1701 | ] 1702 | }, 1703 | { 1704 | "cell_type": "code", 1705 | "execution_count": null, 1706 | "metadata": {}, 1707 | "outputs": [ 1708 | { 1709 | "name": "stdout", 1710 | "output_type": "stream", 1711 | "text": [ 1712 | "\n", 1713 | " if not html:\n", 1714 | " return []\n", 1715 | " return [url for url in re.findall(r'', html)]\n", 1716 | "================================================================================\n", 1717 | "\n", 1718 | " return [url for url in re.findall(r']*>', html)\n" 1726 | ] 1727 | } 1728 | ], 1729 | "source": [ 1730 | "prompt = '''def get_urls_from_html(html):\n", 1731 | " \"\"\"Get all embedded URLs in a HTML string.\"\"\"'''\n", 1732 | "complete_code(generation, prompt)" 1733 | ] 1734 | }, 1735 | { 1736 | "cell_type": "code", 1737 | "execution_count": null, 1738 | "metadata": {}, 1739 | "outputs": [ 1740 | { 1741 | "name": "stdout", 1742 | "output_type": "stream", 1743 | "text": [ 1744 | "https://github.com/huggingface/transformers | /allenai | /facebook |\n", 1745 | "/asteroid-team | /google | /amazon | /speechbrain | /microsoft | /grammarly |\n", 1746 | "/models | /inference-api | /distilbert-base-uncased |\n", 1747 | "/dbmdz/bert-large-cased-finetuned-conll03-english |\n", 1748 | "https://huggingface.co/transformers | https://arxiv.org/abs/1811.06031 |\n", 1749 | "https://arxiv.org/abs/1803.10631 | https://transformer.huggingface.co/ | /coref\n", 1750 | "| https://medium.com/huggingface/distilbert-8cf3380435b5\n" 1751 | ] 1752 | } 1753 | ], 1754 | "source": [ 1755 | "import requests\n", 1756 | "\n", 1757 | "def get_urls_from_html(html):\n", 1758 | " return [url for url in re.findall(r' **NOTE**: In the following code block, a large GPT-2 checkpoint is loaded into memory. On platforms like Colab and Kaggle, this can cause the instance to crash due to insufficient RAM or GPU memory. 
You can still run the example if you replace the large model with the small one by using `model_ckpt = \"transformersbook/codeparrot-small\"`.\n", 1768 | " " 1769 | ] 1770 | }, 1771 | { 1772 | "cell_type": "code", 1773 | "execution_count": null, 1774 | "metadata": {}, 1775 | "outputs": [ 1776 | { 1777 | "name": "stderr", 1778 | "output_type": "stream", 1779 | "text": [ 1780 | "Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.\n" 1781 | ] 1782 | }, 1783 | { 1784 | "name": "stdout", 1785 | "output_type": "stream", 1786 | "text": [ 1787 | "\n", 1788 | " return np.mean(a)\n", 1789 | "================================================================================\n", 1790 | "\n", 1791 | " return np.mean(a)\n", 1792 | "================================================================================\n", 1793 | "\n", 1794 | " return np.mean(a)\n", 1795 | "================================================================================\n", 1796 | "\n", 1797 | " return np.mean(a)\n" 1798 | ] 1799 | } 1800 | ], 1801 | "source": [ 1802 | "model_ckpt = 'transformersbook/codeparrot'\n", 1803 | "generation = pipeline('text-generation', model=model_ckpt, device=0)\n", 1804 | "\n", 1805 | "prompt = '''# a function in native python:\n", 1806 | "def mean(a):\n", 1807 | " return sum(a)/len(a)\n", 1808 | "\n", 1809 | "# the same function using numpy:\n", 1810 | "import numpy as np\n", 1811 | "def mean(a):'''\n", 1812 | "complete_code(generation, prompt, max_length=64)" 1813 | ] 1814 | }, 1815 | { 1816 | "cell_type": "code", 1817 | "execution_count": null, 1818 | "metadata": {}, 1819 | "outputs": [ 1820 | { 1821 | "name": "stderr", 1822 | "output_type": "stream", 1823 | "text": [ 1824 | "Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.\n" 1825 | ] 1826 | }, 1827 | { 1828 | "name": "stdout", 1829 | "output_type": "stream", 1830 | "text": [ 1831 | "\n", 1832 | "reg = DummyRegressor()\n", 1833 | "\n", 1834 | "forest = RandomForestClassifier(n_estimators=20)\n", 1835 | "\n", 1836 | "forest.fit(X, y)\n", 1837 | "================================================================================\n", 1838 | "\n", 1839 | "clf = ExtraTreesClassifier(n_estimators=100, max_features='sqrt')\n", 1840 | "clf.fit(X, y)\n", 1841 | "================================================================================\n", 1842 | "\n", 1843 | "clf = RandomForestClassifier(n_estimators=20, n_jobs=n_jobs, random_state=1)\n", 1844 | "clf.fit(X, y)\n", 1845 | "================================================================================\n", 1846 | "\n", 1847 | "clf = RandomForestClassifier(n_estimators=20)\n", 1848 | "clf.fit(X, y)\n" 1849 | ] 1850 | } 1851 | ], 1852 | "source": [ 1853 | "prompt = '''X = np.random.randn(100, 100)\n", 1854 | "y = np.random.randint(0, 1, 100)\n", 1855 | "\n", 1856 | "# fit random forest classifier with 20 estimators'''\n", 1857 | "complete_code(generation, prompt, max_length=96)" 1858 | ] 1859 | }, 1860 | { 1861 | "cell_type": "markdown", 1862 | "metadata": {}, 1863 | "source": [ 1864 | "## Conclusion" 1865 | ] 1866 | }, 1867 | { 1868 | "cell_type": "code", 1869 | "execution_count": null, 1870 | "metadata": {}, 1871 | "outputs": [], 1872 | "source": [] 1873 | } 1874 | ], 1875 | "metadata": { 1876 | "kernelspec": { 1877 | "display_name": "Python 3 (ipykernel)", 1878 | "language": "python", 1879 | "name": "python3" 1880 | } 1881 | }, 1882 | "nbformat": 4, 1883 | "nbformat_minor": 4 1884 | } 1885 | 
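Two quick sanity checks on the notebook above. First, the buffer sizing inside `ConstantLengthDataset`: a minimal, self-contained sketch of that arithmetic, using only constants that appear in the notebook's own demo run (`num_of_sequences=10`; the class defaults to `num_of_sequences=1024` and `chars_per_token=3.6`, approximating the measured ~3.62):

```python
# Minimal sketch of the ConstantLengthDataset buffer arithmetic.
seq_length = 1024      # tokens per packed training sequence
chars_per_token = 3.6  # estimated characters per token for Python code
num_of_sequences = 10  # value passed explicitly in the demo run

# The dataset buffers raw text until it holds roughly enough characters
# to tokenize into num_of_sequences sequences of seq_length tokens each.
input_characters = seq_length * chars_per_token * num_of_sequences
print(int(input_characters))  # 36864, matching the "Fill buffer: ...<36864" logs
```

Second, `evaluate()` reports perplexity as the exponential of the mean cross-entropy loss. A tiny illustration of that relation (the loss value below is made up for demonstration, not taken from the training run):

```python
import torch

# Perplexity is the exponentiated cross-entropy loss, as in evaluate() above.
loss = torch.tensor(2.0)      # illustrative mean evaluation loss
perplexity = torch.exp(loss)  # exp(2.0) ~= 7.39
print(f"{perplexity.item():.2f}")  # the model is as uncertain as a uniform
                                   # choice over ~7.4 tokens at each step
```

Note that `ConstantLengthDataset` simply drops leftover token runs shorter than `seq_length` at the end of each buffer, so the packed batches never need padding.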
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Transformers Notebooks 2 | 3 | This repository contains the example code from our O'Reilly book [Natural Language Processing with Transformers](https://www.oreilly.com/library/view/natural-language-processing/9781098136789/): 4 | 5 | book-cover 6 | 7 | ## Getting started 8 | 9 | You can run these notebooks on cloud platforms like [Google Colab](https://colab.research.google.com/) or your local machine. Note that most chapters require a GPU to run in a reasonable amount of time, so we recommend one of the cloud platforms as they come pre-installed with CUDA. 
10 | 11 | ### Running on a cloud platform 12 | 13 | To run these notebooks on a cloud platform, just click on one of the badges in the table below: 14 | 15 | 16 | 17 | 18 | 19 | | Chapter | Colab | Kaggle | Gradient | Studio Lab | 20 | |:--------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 21 | | Introduction | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nlp-with-transformers/notebooks/blob/main/01_introduction.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/nlp-with-transformers/notebooks/blob/main/01_introduction.ipynb) | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/nlp-with-transformers/notebooks/blob/main/01_introduction.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/nlp-with-transformers/notebooks/blob/main/01_introduction.ipynb) | 22 | | Text Classification | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nlp-with-transformers/notebooks/blob/main/02_classification.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/nlp-with-transformers/notebooks/blob/main/02_classification.ipynb) | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/nlp-with-transformers/notebooks/blob/main/02_classification.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/nlp-with-transformers/notebooks/blob/main/02_classification.ipynb) | 23 | | Transformer Anatomy | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nlp-with-transformers/notebooks/blob/main/03_transformer-anatomy.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/nlp-with-transformers/notebooks/blob/main/03_transformer-anatomy.ipynb) | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/nlp-with-transformers/notebooks/blob/main/03_transformer-anatomy.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/nlp-with-transformers/notebooks/blob/main/03_transformer-anatomy.ipynb) | 24 | | Multilingual Named Entity Recognition | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nlp-with-transformers/notebooks/blob/main/04_multilingual-ner.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/nlp-with-transformers/notebooks/blob/main/04_multilingual-ner.ipynb) | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/nlp-with-transformers/notebooks/blob/main/04_multilingual-ner.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/nlp-with-transformers/notebooks/blob/main/04_multilingual-ner.ipynb) | 25 | | Text Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nlp-with-transformers/notebooks/blob/main/05_text-generation.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/nlp-with-transformers/notebooks/blob/main/05_text-generation.ipynb) | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/nlp-with-transformers/notebooks/blob/main/05_text-generation.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/nlp-with-transformers/notebooks/blob/main/05_text-generation.ipynb) | 26 | | Summarization | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nlp-with-transformers/notebooks/blob/main/06_summarization.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/nlp-with-transformers/notebooks/blob/main/06_summarization.ipynb) | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/nlp-with-transformers/notebooks/blob/main/06_summarization.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/nlp-with-transformers/notebooks/blob/main/06_summarization.ipynb) | 27 | | Question Answering | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nlp-with-transformers/notebooks/blob/main/07_question-answering.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/nlp-with-transformers/notebooks/blob/main/07_question-answering.ipynb) | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/nlp-with-transformers/notebooks/blob/main/07_question-answering.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/nlp-with-transformers/notebooks/blob/main/07_question-answering.ipynb) | 28 | | Making Transformers Efficient in Production | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nlp-with-transformers/notebooks/blob/main/08_model-compression.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/nlp-with-transformers/notebooks/blob/main/08_model-compression.ipynb) | 
[![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/nlp-with-transformers/notebooks/blob/main/08_model-compression.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/nlp-with-transformers/notebooks/blob/main/08_model-compression.ipynb) | 29 | | Dealing with Few to No Labels | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nlp-with-transformers/notebooks/blob/main/09_few-to-no-labels.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/nlp-with-transformers/notebooks/blob/main/09_few-to-no-labels.ipynb) | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/nlp-with-transformers/notebooks/blob/main/09_few-to-no-labels.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/nlp-with-transformers/notebooks/blob/main/09_few-to-no-labels.ipynb) | 30 | | Training Transformers from Scratch | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nlp-with-transformers/notebooks/blob/main/10_transformers-from-scratch.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/nlp-with-transformers/notebooks/blob/main/10_transformers-from-scratch.ipynb) | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/nlp-with-transformers/notebooks/blob/main/10_transformers-from-scratch.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/nlp-with-transformers/notebooks/blob/main/10_transformers-from-scratch.ipynb) | 31 | | Future Directions | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nlp-with-transformers/notebooks/blob/main/11_future-directions.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/nlp-with-transformers/notebooks/blob/main/11_future-directions.ipynb) | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/nlp-with-transformers/notebooks/blob/main/11_future-directions.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/nlp-with-transformers/notebooks/blob/main/11_future-directions.ipynb) | 32 | 33 | 34 | 35 | Nowadays, the GPUs on Colab tend to be K80s (which have limited memory), so we recommend using [Kaggle](https://www.kaggle.com/docs/notebooks), [Gradient](https://gradient.run/notebooks), or [SageMaker Studio Lab](https://studiolab.sagemaker.aws/). These platforms tend to provide more performant GPUs like P100s, all for free! 36 | 37 | > Note: some cloud platforms like Kaggle require you to restart the notebook after installing new packages. 
38 |
39 | ### Running on your machine
40 |
41 | To run the notebooks on your own machine, first clone the repository and navigate to it:
42 |
43 | ```bash
44 | $ git clone https://github.com/nlp-with-transformers/notebooks.git
45 | $ cd notebooks
46 | ```
47 |
48 | Next, run the following command to create a `conda` virtual environment that contains all the libraries needed to run the notebooks:
49 |
50 | ```bash
51 | $ conda env create -f environment.yml
52 | ```
53 |
54 | > Note: You'll need a GPU that supports NVIDIA's [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) to build the environment. Currently, this means you cannot build locally on Apple silicon 😢.
55 |
56 | Chapter 7 (Question Answering) has a special set of dependencies, so to run that chapter you'll need a separate environment:
57 |
58 | ```bash
59 | $ conda env create -f environment-chapter7.yml
60 | ```
61 |
62 | Once you've installed the dependencies, you can activate the `conda` environment and spin up the notebooks as follows:
63 |
64 | ```bash
65 | $ conda activate book # or conda activate book-chapter7
66 | $ jupyter notebook
67 | ```
68 |
69 | ## FAQ
70 |
71 | ### When trying to clone the notebooks on Kaggle I get a message that I am unable to access the book's GitHub repository. How can I solve this issue?
72 |
73 | This issue is likely due to a missing internet connection. When running your first notebook on Kaggle you need to enable internet access in the settings menu on the right side.
74 |
75 | ### How do you select a GPU on Kaggle?
76 |
77 | You can enable GPU usage by selecting *GPU* as *Accelerator* in the settings menu on the right side.
78 |
79 | ## Citations
80 |
81 | If you'd like to cite this book, you can use the following BibTeX entry:
82 |
83 | ```
84 | @book{tunstall2022natural,
85 |   title={Natural Language Processing with Transformers: Building Language Applications with Hugging Face},
86 |   author={Tunstall, Lewis and von Werra, Leandro and Wolf, Thomas},
87 |   isbn={1098103246},
88 |   url={https://books.google.ch/books?id=7hhyzgEACAAJ},
89 |   year={2022},
90 |   publisher={O'Reilly Media, Incorporated}
91 | }
92 | ```
93 |
-------------------------------------------------------------------------------- /SageMaker/01_introduction.ipynb: --------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "%%capture\n",
10 | "%pip install datasets transformers[tf,torch,sentencepiece,vision,optuna,sklearn,onnxruntime]==4.11.3"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "#hide\n",
20 | "from utils import *\n",
21 | "setup_chapter()"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "In this notebook we demonstrate how you can run the examples from the book on Amazon SageMaker. \n",
29 | "\n",
30 | "The SageMaker notebook uses an AWS IAM role to access AWS resources such as an Amazon S3 bucket.\n",
31 | "You created this role during the notebook creation process described in SageMaker/README.md.\n",
32 | "In the AWS IAM service you can review the access policy and modify it if needed.\n",
33 | "\n",
34 | "In the next cell we check whether an Amazon S3 bucket exists and create a new one if not. In addition, we'll get the SageMaker role and session."
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "import sagemaker.huggingface\n",
44 | "import sagemaker\n",
45 | "\n",
46 | "sess = sagemaker.Session()\n",
47 | "# sagemaker session bucket -> used for uploading data, models and logs\n",
48 | "# sagemaker will automatically create this bucket if it does not exist\n",
49 | "sagemaker_session_bucket=None\n",
50 | "if sagemaker_session_bucket is None and sess is not None:\n",
51 | "    # set to default bucket if a bucket name is not given\n",
52 | "    sagemaker_session_bucket = sess.default_bucket()\n",
53 | "\n",
54 | "role = sagemaker.get_execution_role()\n",
55 | "sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)\n",
56 | "\n",
57 | "print(f\"sagemaker role arn: {role}\")\n",
58 | "print(f\"sagemaker bucket: {sess.default_bucket()}\")\n",
59 | "print(f\"sagemaker session region: {sess.boto_region_name}\")"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "Now we set up a helper function to easily deploy any Hugging Face model as an endpoint on Amazon SageMaker.\n",
67 | "The following function creates a HuggingFaceModel class that downloads the model from the Hugging Face Hub; the class can also load a trained model stored in an Amazon S3 bucket. \n",
68 | "Next, an endpoint is created to host your model. Based on the model requirements you can choose a specific instance type; instance types differ in the amount of memory, CPU, and GPU they provide. There are different inference options available, such as real-time, asynchronous, or serverless inference.\n",
69 | "If you are not sure which inference option works best for your model, you can use Amazon SageMaker Inference Recommender:\n",
70 | "https://docs.aws.amazon.com/sagemaker/latest/dg/deploy-model.html \n",
71 | "\n",
72 | "To view all options see the documentation: https://sagemaker.readthedocs.io/en/stable/frameworks/huggingface/index.html \n",
73 | "\n",
74 | "Depending on the Transformers version, PyTorch/TensorFlow version, and Python version, the mapping for the Hugging Face Model Class can be found here: https://huggingface.co/docs/sagemaker/reference#inference-dlc-overview \n",
75 | "\n",
76 | "To find the endpoints in the AWS Console, navigate to https://console.aws.amazon.com/sagemaker/home#/endpoints \n",
77 | "\n",
78 | "Make sure to run this notebook to the end so that the endpoint is deleted. "
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "from sagemaker.huggingface.model import HuggingFaceModel\n",
88 | "\n",
89 | "def setup_endpoint(model_name, task_name):\n",
90 | "    # Hub Model configuration.
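The model is downloaded from the Hub when the endpoint is created.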
\n", 91 | " hub = {\n", 92 | " 'HF_MODEL_ID': model_name, # model_id from hf.co/models\n", 93 | " 'HF_TASK': task_name # NLP task you want to use for predictions\n", 94 | " }\n", 95 | "\n", 96 | " # create Hugging Face Model Class\n", 97 | " huggingface_model = HuggingFaceModel(\n", 98 | " env=hub, # configuration for loading model from Hub\n", 99 | " role=role, # iam role with permissions to create an Endpoint\n", 100 | " transformers_version=\"4.17.0\", # transformers version used\n", 101 | " pytorch_version=\"1.10.2\", # pytorch version used\n", 102 | " py_version=\"py38\" # python version used\n", 103 | " )\n", 104 | "\n", 105 | " # deploy model to SageMaker Inference\n", 106 | " predictor = huggingface_model.deploy(\n", 107 | " initial_instance_count=1, # how many instances used\n", 108 | " instance_type=\"ml.m5.xlarge\" # instance type\n", 109 | " )\n", 110 | " return predictor" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "# Hello Transformers" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "## The Encoder-Decoder Framework" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "## Attention Mechanisms" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "## Transfer Learning in NLP" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "## Hugging Face Transformers: Bridging the Gap" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "## A Tour of Transformer Applications" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "text = \"\"\"Dear Amazon, last week I ordered an Optimus Prime action figure \\\n", 162 | "from your online store in Germany. Unfortunately, when I opened the package, \\\n", 163 | "I discovered to my horror that I had been sent an action figure of Megatron \\\n", 164 | "instead! As a lifelong enemy of the Decepticons, I hope you can understand my \\\n", 165 | "dilemma. To resolve the issue, I demand an exchange of Megatron for the \\\n", 166 | "Optimus Prime figure I ordered. Enclosed are copies of my records concerning \\\n", 167 | "this purchase. I expect to hear from you soon. 
Sincerely, Bumblebee.\"\"\"" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "### Text Classification" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "predictor = setup_endpoint('distilbert-base-uncased-finetuned-sst-2-english', 'text-classification')" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "# example request, you always need to define \"inputs\"\n", 193 | "import pandas as pd\n", 194 | "\n", 195 | "# request\n", 196 | "outputs = predictor.predict({\"inputs\": text})\n", 197 | "pd.DataFrame(outputs) " 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "predictor.delete_endpoint()" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "### Named Entity Recognition" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "predictor = setup_endpoint(\"dbmdz/bert-large-cased-finetuned-conll03-english\", \"ner\")" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "outputs = predictor.predict({\"inputs\": text, \"parameters\": {\"aggregation_strategy\": \"simple\"}})\n", 232 | "pd.DataFrame(outputs) " 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "predictor.delete_endpoint()" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "### Question Answering " 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "predictor = setup_endpoint(\"distilbert-base-cased-distilled-squad\", 'question-answering')" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "question = \"What does the customer want?\"\n", 267 | "\n", 268 | "outputs = predictor.predict({\"inputs\": {\n", 269 | " \"question\": question,\n", 270 | " \"context\": text\n", 271 | " }\n", 272 | "})\n", 273 | "\n", 274 | "pd.DataFrame([outputs]) " 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "predictor.delete_endpoint()" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "### Summarization" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "predictor = setup_endpoint(\"sshleifer/distilbart-cnn-12-6\", 'summarization')" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "outputs = predictor.predict({\"inputs\": text,\n", 309 | " \"parameters\": {\n", 310 | " \"max_length\":45,\n", 311 | " \"clean_up_tokenization_spaces\":True\n", 312 | " }\n", 313 | " })\n", 314 | "print(outputs[0]['summary_text'])" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | 
"execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "predictor.delete_endpoint()" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "### Translation" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "predictor = setup_endpoint(\"Helsinki-NLP/opus-mt-en-de\", \"translation\")" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "outputs = predictor.predict({\"inputs\": text,\n", 349 | " \"parameters\": {\n", 350 | " \"min_length\":100,\n", 351 | " \"clean_up_tokenization_spaces\":True\n", 352 | " }\n", 353 | " })\n", 354 | "print(outputs[0]['translation_text'])" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "predictor.delete_endpoint()" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "### Text Generation" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "predictor = setup_endpoint(\"gpt2\", 'text-generation')" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "response = \"Dear Bumblebee, I am sorry to hear that your order was mixed up.\"\n", 389 | "prompt = text + \"\\n\\nCustomer service response:\\n\" + response\n", 390 | "\n", 391 | "outputs = predictor.predict({\"inputs\": prompt,\n", 392 | " \"parameters\": {\n", 393 | " \"max_length\":200\n", 394 | " }\n", 395 | " })\n", 396 | "print(outputs[0]['generated_text'])" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "predictor.delete_endpoint()" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": {}, 411 | "source": [ 412 | "## The Hugging Face Ecosystem" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "### The Hugging Face Hub" 420 | ] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": {}, 425 | "source": [ 426 | "### Hugging Face Tokenizers" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": {}, 432 | "source": [ 433 | "### Hugging Face Datasets" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "### Hugging Face Accelerate" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "## Main Challenges with Transformers" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": {}, 453 | "source": [ 454 | "## Conclusion" 455 | ] 456 | } 457 | ], 458 | "metadata": { 459 | "kernelspec": { 460 | "display_name": "Python 3.9.13 64-bit", 461 | "language": "python", 462 | "name": "python3" 463 | }, 464 | "language_info": { 465 | "codemirror_mode": { 466 | "name": "ipython", 467 | "version": 3 468 | }, 469 | "file_extension": ".py", 470 | "mimetype": "text/x-python", 471 | "name": "python", 472 | "nbconvert_exporter": "python", 473 | "pygments_lexer": "ipython3", 474 | "version": "3.9.13" 475 | }, 476 | "vscode": { 477 | "interpreter": { 478 | "hash": 
"aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" 479 | } 480 | } 481 | }, 482 | "nbformat": 4, 483 | "nbformat_minor": 4 484 | } 485 | -------------------------------------------------------------------------------- /SageMaker/02_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%%capture\n", 12 | "%pip install datasets[audio]==1.16.1 umap-learn==0.5.1 datasets[s3] transformers[tf,torch,sentencepiece,vision,optuna,sklearn,onnxruntime]==4.11.3" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "#hide\n", 22 | "from utils import *\n", 23 | "setup_chapter()" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "# Text Classification" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## The Dataset" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### A First Look at Hugging Face Datasets" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "from datasets import list_datasets\n", 54 | "\n", 55 | "all_datasets = list_datasets()\n", 56 | "print(f\"There are {len(all_datasets)} datasets currently available on the Hub\")\n", 57 | "print(f\"The first 10 are: {all_datasets[:10]}\")" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# hide_output\n", 67 | "from datasets import load_dataset\n", 68 | "\n", 69 | "emotions = load_dataset(\"emotion\")" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "emotions" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "train_ds = emotions[\"train\"]\n", 88 | "train_ds" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "len(train_ds)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "train_ds[0]" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "train_ds.column_names" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "print(train_ds.features)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "print(train_ds[:5])" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "print(train_ds[\"text\"][:5])" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "### From Datasets to DataFrames" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 
158 | "import pandas as pd\n", 159 | "\n", 160 | "emotions.set_format(type=\"pandas\")\n", 161 | "df = emotions[\"train\"][:]\n", 162 | "df.head()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "def label_int2str(row):\n", 172 | " return emotions[\"train\"].features[\"label\"].int2str(row)\n", 173 | "\n", 174 | "df[\"label_name\"] = df[\"label\"].apply(label_int2str)\n", 175 | "df.head()" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "### Looking at the Class Distribution" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "import matplotlib.pyplot as plt\n", 192 | "\n", 193 | "df[\"label_name\"].value_counts(ascending=True).plot.barh()\n", 194 | "plt.title(\"Frequency of Classes\")\n", 195 | "plt.show()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "### How Long Are Our Tweets?" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "df[\"Words Per Tweet\"] = df[\"text\"].str.split().apply(len)\n", 212 | "df.boxplot(\"Words Per Tweet\", by=\"label_name\", grid=False, showfliers=False,\n", 213 | " color=\"black\")\n", 214 | "plt.suptitle(\"\")\n", 215 | "plt.xlabel(\"\")\n", 216 | "plt.show()" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "emotions.reset_format()" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "## From Text to Tokens" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### Character Tokenization" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "text = \"Tokenizing text is a core task of NLP.\"\n", 249 | "tokenized_text = list(text)\n", 250 | "print(tokenized_text)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "token2idx = {ch: idx for idx, ch in enumerate(sorted(set(tokenized_text)))}\n", 260 | "print(token2idx)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "input_ids = [token2idx[token] for token in tokenized_text]\n", 270 | "print(input_ids)" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "categorical_df = pd.DataFrame(\n", 280 | " {\"Name\": [\"Bumblebee\", \"Optimus Prime\", \"Megatron\"], \"Label ID\": [0,1,2]})\n", 281 | "categorical_df" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "pd.get_dummies(categorical_df[\"Name\"])" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "import torch\n", 300 | "import torch.nn.functional as F\n", 301 | "\n", 302 | "input_ids = torch.tensor(input_ids)\n", 303 | "one_hot_encodings = F.one_hot(input_ids, 
num_classes=len(token2idx))\n", 304 | "one_hot_encodings.shape" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "print(f\"Token: {tokenized_text[0]}\")\n", 314 | "print(f\"Tensor index: {input_ids[0]}\")\n", 315 | "print(f\"One-hot: {one_hot_encodings[0]}\")" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "### Word Tokenization" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "tokenized_text = text.split()\n", 332 | "print(tokenized_text)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "### Subword Tokenization" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "# hide_output\n", 349 | "from transformers import AutoTokenizer\n", 350 | "\n", 351 | "model_ckpt = \"distilbert-base-uncased\"\n", 352 | "tokenizer = AutoTokenizer.from_pretrained(model_ckpt)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "encoded_text = tokenizer(text)\n", 362 | "print(encoded_text)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)\n", 372 | "print(tokens)" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [ 381 | "print(tokenizer.convert_tokens_to_string(tokens))" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "tokenizer.vocab_size" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": null, 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "tokenizer.model_max_length" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "tokenizer.model_input_names" 409 | ] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "metadata": {}, 414 | "source": [ 415 | "### Tokenizing the Whole Dataset" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "def tokenize(batch):\n", 425 | " return tokenizer(batch[\"text\"], padding=True, truncation=True)" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [ 434 | "print(tokenize(emotions[\"train\"][:2]))" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "metadata": {}, 441 | "outputs": [], 442 | "source": [ 443 | "#hide_input\n", 444 | "tokens2ids = list(zip(tokenizer.all_special_tokens, tokenizer.all_special_ids))\n", 445 | "data = sorted(tokens2ids, key=lambda x : x[-1])\n", 446 | "df = pd.DataFrame(data, columns=[\"Special Token\", \"Special Token ID\"])\n", 447 | "df.T" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "# hide_output\n", 457 | 
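"# batched=True with batch_size=None tokenizes each split as a single batch\n",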
"emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "print(emotions_encoded[\"train\"].column_names)" 467 | ] 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "metadata": {}, 472 | "source": [ 473 | "## Training a Text Classifier" 474 | ] 475 | }, 476 | { 477 | "cell_type": "markdown", 478 | "metadata": {}, 479 | "source": [ 480 | "### Transformers as Feature Extractors" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "#### Using pretrained models" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "metadata": {}, 494 | "outputs": [], 495 | "source": [ 496 | "# hide_output\n", 497 | "from transformers import AutoModel\n", 498 | "\n", 499 | "model_ckpt = \"distilbert-base-uncased\"\n", 500 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 501 | "model = AutoModel.from_pretrained(model_ckpt).to(device)" 502 | ] 503 | }, 504 | { 505 | "cell_type": "markdown", 506 | "metadata": {}, 507 | "source": [ 508 | "#### Extracting the last hidden states" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "text = \"this is a test\"\n", 518 | "inputs = tokenizer(text, return_tensors=\"pt\")\n", 519 | "print(f\"Input tensor shape: {inputs['input_ids'].size()}\")" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | "inputs = {k:v.to(device) for k,v in inputs.items()}\n", 529 | "with torch.no_grad():\n", 530 | " outputs = model(**inputs)\n", 531 | "print(outputs)" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": null, 537 | "metadata": {}, 538 | "outputs": [], 539 | "source": [ 540 | "outputs.last_hidden_state.size()" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [ 549 | "outputs.last_hidden_state[:,0].size()" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": {}, 556 | "outputs": [], 557 | "source": [ 558 | "def extract_hidden_states(batch):\n", 559 | " # Place model inputs on the GPU\n", 560 | " inputs = {k:v.to(device) for k,v in batch.items() \n", 561 | " if k in tokenizer.model_input_names}\n", 562 | " # Extract last hidden states\n", 563 | " with torch.no_grad():\n", 564 | " last_hidden_state = model(**inputs).last_hidden_state\n", 565 | " # Return vector for [CLS] token\n", 566 | " return {\"hidden_state\": last_hidden_state[:,0].cpu().numpy()}" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": null, 572 | "metadata": {}, 573 | "outputs": [], 574 | "source": [ 575 | "emotions_encoded.set_format(\"torch\", \n", 576 | " columns=[\"input_ids\", \"attention_mask\", \"label\"])" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [ 585 | "#hide_output\n", 586 | "emotions_hidden = emotions_encoded.map(extract_hidden_states, batched=True)" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": {}, 593 | "outputs": [], 594 | "source": [ 595 | "emotions_hidden[\"train\"].column_names" 596 | 
] 597 | }, 598 | { 599 | "cell_type": "markdown", 600 | "metadata": {}, 601 | "source": [ 602 | "#### Creating a feature matrix" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "metadata": {}, 609 | "outputs": [], 610 | "source": [ 611 | "import numpy as np\n", 612 | "\n", 613 | "X_train = np.array(emotions_hidden[\"train\"][\"hidden_state\"])\n", 614 | "X_valid = np.array(emotions_hidden[\"validation\"][\"hidden_state\"])\n", 615 | "y_train = np.array(emotions_hidden[\"train\"][\"label\"])\n", 616 | "y_valid = np.array(emotions_hidden[\"validation\"][\"label\"])\n", 617 | "X_train.shape, X_valid.shape" 618 | ] 619 | }, 620 | { 621 | "cell_type": "markdown", 622 | "metadata": {}, 623 | "source": [ 624 | "#### Visualizing the training set" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": null, 630 | "metadata": {}, 631 | "outputs": [], 632 | "source": [ 633 | "from umap import UMAP\n", 634 | "from sklearn.preprocessing import MinMaxScaler\n", 635 | "\n", 636 | "# Scale features to [0,1] range\n", 637 | "X_scaled = MinMaxScaler().fit_transform(X_train)\n", 638 | "# Initialize and fit UMAP\n", 639 | "mapper = UMAP(n_components=2, metric=\"cosine\").fit(X_scaled)\n", 640 | "# Create a DataFrame of 2D embeddings\n", 641 | "df_emb = pd.DataFrame(mapper.embedding_, columns=[\"X\", \"Y\"])\n", 642 | "df_emb[\"label\"] = y_train\n", 643 | "df_emb.head()" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": null, 649 | "metadata": {}, 650 | "outputs": [], 651 | "source": [ 652 | "fig, axes = plt.subplots(2, 3, figsize=(7,5))\n", 653 | "axes = axes.flatten()\n", 654 | "cmaps = [\"Greys\", \"Blues\", \"Oranges\", \"Reds\", \"Purples\", \"Greens\"]\n", 655 | "labels = emotions[\"train\"].features[\"label\"].names\n", 656 | "\n", 657 | "for i, (label, cmap) in enumerate(zip(labels, cmaps)):\n", 658 | " df_emb_sub = df_emb.query(f\"label == {i}\")\n", 659 | " axes[i].hexbin(df_emb_sub[\"X\"], df_emb_sub[\"Y\"], cmap=cmap,\n", 660 | " gridsize=20, linewidths=(0,))\n", 661 | " axes[i].set_title(label)\n", 662 | " axes[i].set_xticks([]), axes[i].set_yticks([])\n", 663 | "\n", 664 | "plt.tight_layout()\n", 665 | "plt.show()" 666 | ] 667 | }, 668 | { 669 | "cell_type": "markdown", 670 | "metadata": {}, 671 | "source": [ 672 | "#### Training a simple classifier\n" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": null, 678 | "metadata": {}, 679 | "outputs": [], 680 | "source": [ 681 | "#hide_output\n", 682 | "# We increase `max_iter` to guarantee convergence \n", 683 | "from sklearn.linear_model import LogisticRegression\n", 684 | "\n", 685 | "lr_clf = LogisticRegression(max_iter=3000)\n", 686 | "lr_clf.fit(X_train, y_train)" 687 | ] 688 | }, 689 | { 690 | "cell_type": "code", 691 | "execution_count": null, 692 | "metadata": {}, 693 | "outputs": [], 694 | "source": [ 695 | "lr_clf.score(X_valid, y_valid)" 696 | ] 697 | }, 698 | { 699 | "cell_type": "code", 700 | "execution_count": null, 701 | "metadata": {}, 702 | "outputs": [], 703 | "source": [ 704 | "from sklearn.dummy import DummyClassifier\n", 705 | "\n", 706 | "dummy_clf = DummyClassifier(strategy=\"most_frequent\")\n", 707 | "dummy_clf.fit(X_train, y_train)\n", 708 | "dummy_clf.score(X_valid, y_valid)" 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": null, 714 | "metadata": {}, 715 | "outputs": [], 716 | "source": [ 717 | "from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix\n", 718 | "\n", 
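"# Plot a confusion matrix normalized over the true labels\n",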
719 | "def plot_confusion_matrix(y_preds, y_true, labels):\n", 720 | " cm = confusion_matrix(y_true, y_preds, normalize=\"true\")\n", 721 | " fig, ax = plt.subplots(figsize=(6, 6))\n", 722 | " disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)\n", 723 | " disp.plot(cmap=\"Blues\", values_format=\".2f\", ax=ax, colorbar=False)\n", 724 | " plt.title(\"Normalized confusion matrix\")\n", 725 | " plt.show()\n", 726 | " \n", 727 | "y_preds = lr_clf.predict(X_valid)\n", 728 | "plot_confusion_matrix(y_preds, y_valid, labels)" 729 | ] 730 | }, 731 | { 732 | "cell_type": "markdown", 733 | "metadata": {}, 734 | "source": [ 735 | "### Fine-Tuning Transformers" 736 | ] 737 | }, 738 | { 739 | "cell_type": "markdown", 740 | "metadata": {}, 741 | "source": [ 742 | "#### Loading a pretrained model" 743 | ] 744 | }, 745 | { 746 | "cell_type": "code", 747 | "execution_count": null, 748 | "metadata": {}, 749 | "outputs": [], 750 | "source": [ 751 | "# hide_output\n", 752 | "from transformers import AutoModelForSequenceClassification\n", 753 | "\n", 754 | "num_labels = 6\n", 755 | "model = (AutoModelForSequenceClassification\n", 756 | " .from_pretrained(model_ckpt, num_labels=num_labels)\n", 757 | " .to(device))" 758 | ] 759 | }, 760 | { 761 | "cell_type": "markdown", 762 | "metadata": {}, 763 | "source": [ 764 | "#### Defining the performance metrics" 765 | ] 766 | }, 767 | { 768 | "cell_type": "code", 769 | "execution_count": null, 770 | "metadata": {}, 771 | "outputs": [], 772 | "source": [ 773 | "from sklearn.metrics import accuracy_score, f1_score\n", 774 | "\n", 775 | "def compute_metrics(pred):\n", 776 | " labels = pred.label_ids\n", 777 | " preds = pred.predictions.argmax(-1)\n", 778 | " f1 = f1_score(labels, preds, average=\"weighted\")\n", 779 | " acc = accuracy_score(labels, preds)\n", 780 | " return {\"accuracy\": acc, \"f1\": f1}" 781 | ] 782 | }, 783 | { 784 | "cell_type": "markdown", 785 | "metadata": {}, 786 | "source": [ 787 | "#### Training the model" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": null, 793 | "metadata": {}, 794 | "outputs": [], 795 | "source": [ 796 | "from huggingface_hub import notebook_login\n", 797 | "\n", 798 | "notebook_login()" 799 | ] 800 | }, 801 | { 802 | "cell_type": "code", 803 | "execution_count": null, 804 | "metadata": {}, 805 | "outputs": [], 806 | "source": [ 807 | "from huggingface_hub import HfFolder\n", 808 | "\n", 809 | "username = 'simonmesserli' #replace with your own username from hugging face.\n", 810 | "hub_token = HfFolder.get_token()" 811 | ] 812 | }, 813 | { 814 | "cell_type": "markdown", 815 | "metadata": {}, 816 | "source": [ 817 | "### Training with SageMaker" 818 | ] 819 | }, 820 | { 821 | "cell_type": "code", 822 | "execution_count": null, 823 | "metadata": {}, 824 | "outputs": [], 825 | "source": [ 826 | "import sagemaker.huggingface\n", 827 | "import sagemaker\n", 828 | "\n", 829 | "sess = sagemaker.Session()\n", 830 | "# sagemaker session bucket -> used for uploading data, models and logs\n", 831 | "# sagemaker will automatically create this bucket if it not exists\n", 832 | "sagemaker_session_bucket=None\n", 833 | "if sagemaker_session_bucket is None and sess is not None:\n", 834 | " # set to default bucket if a bucket name is not given\n", 835 | " sagemaker_session_bucket = sess.default_bucket()\n", 836 | "\n", 837 | "role = sagemaker.get_execution_role()\n", 838 | "sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)\n", 839 | "\n", 840 | "print(f\"sagemaker role arn: 
{role}\")\n", 841 | "print(f\"sagemaker bucket: {sess.default_bucket()}\")\n", 842 | "print(f\"sagemaker session region: {sess.boto_region_name}\")" 843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": null, 848 | "metadata": {}, 849 | "outputs": [], 850 | "source": [ 851 | "import botocore\n", 852 | "from datasets.filesystems import S3FileSystem\n", 853 | "\n", 854 | "s3 = S3FileSystem()\n", 855 | "\n", 856 | "s3_prefix = 'samples/datasets/02_classification'\n", 857 | "\n", 858 | "train_dataset=emotions_encoded[\"train\"]\n", 859 | "eval_dataset=emotions_encoded[\"validation\"]\n", 860 | "\n", 861 | "# save train_dataset to s3\n", 862 | "training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'\n", 863 | "train_dataset.save_to_disk(training_input_path, fs=s3)\n", 864 | "\n", 865 | "# save eval_dataset to s3\n", 866 | "eval_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/validation'\n", 867 | "eval_dataset.save_to_disk(eval_input_path, fs=s3)" 868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": null, 873 | "metadata": { 874 | "scrolled": true 875 | }, 876 | "outputs": [], 877 | "source": [ 878 | "!pygmentize ./scripts/02_classification_train.py" 879 | ] 880 | }, 881 | { 882 | "cell_type": "code", 883 | "execution_count": null, 884 | "metadata": {}, 885 | "outputs": [], 886 | "source": [ 887 | "from sagemaker.huggingface import HuggingFace\n", 888 | "import time\n", 889 | "\n", 890 | "batch_size = 64\n", 891 | "logging_steps = len(emotions_encoded[\"train\"]) // batch_size\n", 892 | "model_name = f\"{model_ckpt}-finetuned-emotion\"\n", 893 | "\n", 894 | "# hyperparameters, which are passed into the training job\n", 895 | "hyperparameters={'model_id':model_ckpt,\n", 896 | " 'num_train_epochs':2,\n", 897 | " 'learning_rate':2e-5,\n", 898 | " 'per_device_train_batch_size':batch_size,\n", 899 | " 'per_device_eval_batch_size':batch_size,\n", 900 | " 'learning_rate':2e-5,\n", 901 | " 'weight_decay':0.01,\n", 902 | " 'evaluation_strategy':\"epoch\",\n", 903 | " 'disable_tqdm':False,\n", 904 | " 'logging_steps':logging_steps,\n", 905 | " 'push_to_hub':True,\n", 906 | " 'hub_model_id':username + '/' + model_name,\n", 907 | " 'hub_strategy':\"every_save\",\n", 908 | " 'hub_token':hub_token\n", 909 | " }\n", 910 | "\n", 911 | "\n" 912 | ] 913 | }, 914 | { 915 | "cell_type": "code", 916 | "execution_count": null, 917 | "metadata": {}, 918 | "outputs": [], 919 | "source": [ 920 | "# define Training Job Name \n", 921 | "job_name = f'nlp-book-sagemaker-02classificaton-{time.strftime(\"%Y-%m-%d-%H-%M-%S\", time.localtime())}'\n", 922 | "\n", 923 | "# create the Estimator\n", 924 | "huggingface_estimator = HuggingFace(\n", 925 | " entry_point = '02_classification_train.py', # fine-tuning script used in training jon\n", 926 | " source_dir = './scripts', # directory where fine-tuning script is stored\n", 927 | " instance_type = 'ml.p3.2xlarge', # instances type used for the training job\n", 928 | " instance_count = 1, # the number of instances used for training\n", 929 | " base_job_name = job_name, # the name of the training job\n", 930 | " role = role, # IAM role used in training job to access AWS ressources, e.g. 
Amazon S3\n", 931 | " transformers_version = '4.11', # the transformers version used in the training job\n", 932 | " pytorch_version = '1.9', # the pytorch_version version used in the training job\n", 933 | " py_version = 'py38', # the python version used in the training job\n", 934 | " hyperparameters = hyperparameters, # the hyperparameter used for running the training job\n", 935 | ")" 936 | ] 937 | }, 938 | { 939 | "cell_type": "code", 940 | "execution_count": null, 941 | "metadata": { 942 | "scrolled": true 943 | }, 944 | "outputs": [], 945 | "source": [ 946 | "# define a data input dictonary with our uploaded s3 uris\n", 947 | "data = {\n", 948 | " 'train': training_input_path,\n", 949 | " 'test': eval_input_path\n", 950 | "}\n", 951 | "\n", 952 | "# starting the train job with our uploaded datasets as input\n", 953 | "huggingface_estimator.fit(data, wait=True)" 954 | ] 955 | }, 956 | { 957 | "cell_type": "markdown", 958 | "metadata": {}, 959 | "source": [ 960 | "The logs can be found in Amazon CloudWatch: https://console.aws.amazon.com/cloudwatch/home#logsV2:log-groups/log-group/$252Faws$252Fsagemaker$252FTrainingJobs" 961 | ] 962 | }, 963 | { 964 | "cell_type": "code", 965 | "execution_count": null, 966 | "metadata": {}, 967 | "outputs": [], 968 | "source": [ 969 | "# the model is saved in the S3 bucket and was also pushed to the hugging face hub.\n", 970 | "print(huggingface_estimator.model_data)" 971 | ] 972 | }, 973 | { 974 | "cell_type": "code", 975 | "execution_count": null, 976 | "metadata": {}, 977 | "outputs": [], 978 | "source": [ 979 | "from transformers import Trainer, AutoModel\n", 980 | "\n", 981 | "# we load the model from the hub to the trainer and do further analyses.\n", 982 | "\n", 983 | "model_finetuned = AutoModelForSequenceClassification.from_pretrained('simonmesserli' + '/' + model_name)\n", 984 | "\n", 985 | "trainer = Trainer(model = model_finetuned)" 986 | ] 987 | }, 988 | { 989 | "cell_type": "markdown", 990 | "metadata": {}, 991 | "source": [ 992 | "### Deploy model with SageMaker Endpoint" 993 | ] 994 | }, 995 | { 996 | "cell_type": "code", 997 | "execution_count": null, 998 | "metadata": { 999 | "scrolled": true 1000 | }, 1001 | "outputs": [], 1002 | "source": [ 1003 | "predictor = huggingface_estimator.deploy(1,\"ml.g4dn.xlarge\")" 1004 | ] 1005 | }, 1006 | { 1007 | "cell_type": "code", 1008 | "execution_count": null, 1009 | "metadata": {}, 1010 | "outputs": [], 1011 | "source": [ 1012 | "custom_tweet = {\"inputs\" : \"I saw a movie today and it was really good.\"}\n", 1013 | "predictor.predict(custom_tweet)" 1014 | ] 1015 | }, 1016 | { 1017 | "cell_type": "markdown", 1018 | "metadata": {}, 1019 | "source": [ 1020 | "After running your requests, make sure to delete your endpoint." 
1021 | ] 1022 | }, 1023 | { 1024 | "cell_type": "code", 1025 | "execution_count": null, 1026 | "metadata": {}, 1027 | "outputs": [], 1028 | "source": [ 1029 | "predictor.delete_endpoint()" 1030 | ] 1031 | }, 1032 | { 1033 | "cell_type": "code", 1034 | "execution_count": null, 1035 | "metadata": {}, 1036 | "outputs": [], 1037 | "source": [ 1038 | "# hide_output\n", 1039 | "preds_output = trainer.predict(emotions_encoded[\"validation\"])" 1040 | ] 1041 | }, 1042 | { 1043 | "cell_type": "code", 1044 | "execution_count": null, 1045 | "metadata": {}, 1046 | "outputs": [], 1047 | "source": [ 1048 | "preds_output.metrics" 1049 | ] 1050 | }, 1051 | { 1052 | "cell_type": "code", 1053 | "execution_count": null, 1054 | "metadata": {}, 1055 | "outputs": [], 1056 | "source": [ 1057 | "y_preds = np.argmax(preds_output.predictions, axis=1)" 1058 | ] 1059 | }, 1060 | { 1061 | "cell_type": "code", 1062 | "execution_count": null, 1063 | "metadata": {}, 1064 | "outputs": [], 1065 | "source": [ 1066 | "plot_confusion_matrix(y_preds, y_valid, labels)" 1067 | ] 1068 | }, 1069 | { 1070 | "cell_type": "markdown", 1071 | "metadata": {}, 1072 | "source": [ 1073 | "#### Error analysis" 1074 | ] 1075 | }, 1076 | { 1077 | "cell_type": "code", 1078 | "execution_count": null, 1079 | "metadata": {}, 1080 | "outputs": [], 1081 | "source": [ 1082 | "from torch.nn.functional import cross_entropy\n", 1083 | "\n", 1084 | "def forward_pass_with_label(batch):\n", 1085 | " # Place all input tensors on the same device as the model\n", 1086 | " inputs = {k:v.to(device) for k,v in batch.items() \n", 1087 | " if k in tokenizer.model_input_names}\n", 1088 | "\n", 1089 | " with torch.no_grad():\n", 1090 | " output = model(**inputs)\n", 1091 | " pred_label = torch.argmax(output.logits, axis=-1)\n", 1092 | " loss = cross_entropy(output.logits, batch[\"label\"].to(device), \n", 1093 | " reduction=\"none\")\n", 1094 | "\n", 1095 | " # Place outputs on CPU for compatibility with other dataset columns \n", 1096 | " return {\"loss\": loss.cpu().numpy(), \n", 1097 | " \"predicted_label\": pred_label.cpu().numpy()}" 1098 | ] 1099 | }, 1100 | { 1101 | "cell_type": "code", 1102 | "execution_count": null, 1103 | "metadata": {}, 1104 | "outputs": [], 1105 | "source": [ 1106 | "#hide_output\n", 1107 | "# Convert our dataset back to PyTorch tensors\n", 1108 | "emotions_encoded.set_format(\"torch\", \n", 1109 | " columns=[\"input_ids\", \"attention_mask\", \"label\"])\n", 1110 | "# Compute loss values\n", 1111 | "emotions_encoded[\"validation\"] = emotions_encoded[\"validation\"].map(\n", 1112 | " forward_pass_with_label, batched=True, batch_size=16)" 1113 | ] 1114 | }, 1115 | { 1116 | "cell_type": "code", 1117 | "execution_count": null, 1118 | "metadata": {}, 1119 | "outputs": [], 1120 | "source": [ 1121 | "emotions_encoded.set_format(\"pandas\")\n", 1122 | "cols = [\"text\", \"label\", \"predicted_label\", \"loss\"]\n", 1123 | "df_test = emotions_encoded[\"validation\"][:][cols]\n", 1124 | "df_test[\"label\"] = df_test[\"label\"].apply(label_int2str)\n", 1125 | "df_test[\"predicted_label\"] = (df_test[\"predicted_label\"]\n", 1126 | " .apply(label_int2str))" 1127 | ] 1128 | }, 1129 | { 1130 | "cell_type": "code", 1131 | "execution_count": null, 1132 | "metadata": {}, 1133 | "outputs": [], 1134 | "source": [ 1135 | "#hide_output\n", 1136 | "df_test.sort_values(\"loss\", ascending=False).head(10)" 1137 | ] 1138 | }, 1139 | { 1140 | "cell_type": "code", 1141 | "execution_count": null, 1142 | "metadata": {}, 1143 | "outputs": [], 1144 | "source": [ 
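"# Examples with the lowest loss are the ones the model classifies with the\n", "# highest confidence.\n",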
1145 | "#hide_output\n", 1146 | "df_test.sort_values(\"loss\", ascending=True).head(10)" 1147 | ] 1148 | }, 1149 | { 1150 | "cell_type": "markdown", 1151 | "metadata": {}, 1152 | "source": [ 1153 | "#### Saving and sharing the model" 1154 | ] 1155 | }, 1156 | { 1157 | "cell_type": "code", 1158 | "execution_count": null, 1159 | "metadata": {}, 1160 | "outputs": [], 1161 | "source": [ 1162 | "#hide_output\n", 1163 | "from transformers import pipeline\n", 1164 | "\n", 1165 | "# Change `simonmesserli` to your Hub username\n", 1166 | "model_id = \"simonmesserli/distilbert-base-uncased-finetuned-emotion\"\n", 1167 | "classifier = pipeline(\"text-classification\", model=model_id)" 1168 | ] 1169 | }, 1170 | { 1171 | "cell_type": "code", 1172 | "execution_count": null, 1173 | "metadata": {}, 1174 | "outputs": [], 1175 | "source": [ 1176 | "custom_tweet = \"I saw a movie today and it was really good.\"\n", 1177 | "preds = classifier(custom_tweet, return_all_scores=True)" 1178 | ] 1179 | }, 1180 | { 1181 | "cell_type": "code", 1182 | "execution_count": null, 1183 | "metadata": {}, 1184 | "outputs": [], 1185 | "source": [ 1186 | "preds_df = pd.DataFrame(preds[0])\n", 1187 | "plt.bar(labels, 100 * preds_df[\"score\"], color='C0')\n", 1188 | "plt.title(f'\"{custom_tweet}\"')\n", 1189 | "plt.ylabel(\"Class probability (%)\")\n", 1190 | "plt.show()" 1191 | ] 1192 | }, 1193 | { 1194 | "cell_type": "markdown", 1195 | "metadata": {}, 1196 | "source": [ 1197 | "## Conclusion" 1198 | ] 1199 | } 1200 | ], 1201 | "metadata": { 1202 | "kernelspec": { 1203 | "display_name": "Python 3.9.13 64-bit", 1204 | "language": "python", 1205 | "name": "python3" 1206 | }, 1207 | "language_info": { 1208 | "codemirror_mode": { 1209 | "name": "ipython", 1210 | "version": 3 1211 | }, 1212 | "file_extension": ".py", 1213 | "mimetype": "text/x-python", 1214 | "name": "python", 1215 | "nbconvert_exporter": "python", 1216 | "pygments_lexer": "ipython3", 1217 | "version": "3.9.13" 1218 | }, 1219 | "vscode": { 1220 | "interpreter": { 1221 | "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" 1222 | } 1223 | } 1224 | }, 1225 | "nbformat": 4, 1226 | "nbformat_minor": 4 1227 | } 1228 | -------------------------------------------------------------------------------- /SageMaker/README.md: -------------------------------------------------------------------------------- 1 | # Run the Hugging Face notebooks on SageMaker 2 | 3 | 1/ 4 | Open Amazon SageMaker Notebook Instances in your preferred AWS region: 5 | https://console.aws.amazon.com/sagemaker/home#/notebook-instances 6 | 7 | 2/ 8 | Click **Create notebook instance**. 9 | 10 | 3/ 11 | **Choose an instance type:** 12 | Any ml.t3.* instance or if you want to have an instance with GPU take e.g. ml.g4dn.xlarge. \ 13 | Overview: https://aws.amazon.com/sagemaker/pricing/ 14 | 15 | notebook-config 16 | 17 | 4/ 18 | Choose **notebook-al2-v1** and add more storage volume, e.g. 50 GB. 19 | 20 | 5/ (optional) 21 | If you plan to use JupyterLab, make sure to add a Lifecycle configuration with the following code. 22 | Or execute the code in a terminal in JupyterLab. 23 | 24 | ` 25 | sudo -u ec2-user -i <<'EOF' 26 | EXTENSION_NAME=@jupyter-widgets/jupyterlab-manager 27 | source /home/ec2-user/anaconda3/bin/activate JupyterSystemEnv 28 | jupyter labextension install $EXTENSION_NAME 29 | source /home/ec2-user/anaconda3/bin/deactivate 30 | EOF 31 | ` 32 | 33 | 5/ 34 | Create a new IAM role, which will used in the notebooks to access AWS resources. 
35 | 36 | ![iam-role](images/iam_role.png) 37 | 38 | 7/ 39 | Add this Git repository, which will be cloned to your notebook instance, by selecting *Clone a public Git repository to this notebook instance only*. 40 | Paste the Git repository URL into the next field. 41 | 42 | ![git-repo](images/git_repo.png) 43 | 44 | 8/ 45 | The notebook instance will now be created, and after you click **Open Jupyter** you will see the cloned notebooks. 46 | Navigate to the folder *SageMaker*, open the first chapter and execute one cell after the other. -------------------------------------------------------------------------------- /SageMaker/images/git_repo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/SageMaker/images/git_repo.png -------------------------------------------------------------------------------- /SageMaker/images/iam_role.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/SageMaker/images/iam_role.png -------------------------------------------------------------------------------- /SageMaker/images/notebook_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/SageMaker/images/notebook_config.png -------------------------------------------------------------------------------- /SageMaker/scripts/02_classification_train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import random 5 | import sys 6 | 7 | import numpy as np 8 | import torch 9 | from datasets import load_from_disk, load_metric 10 | from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments 11 | from transformers.trainer_utils import get_last_checkpoint 12 | 13 | if __name__ == "__main__": 14 | 15 | parser = argparse.ArgumentParser() 16 | 17 | # hyperparameters sent by the client are passed as command-line arguments to the script. 18 | parser.add_argument("--model_id", type=str) 19 | parser.add_argument("--num_train_epochs", type=int, default=2) 20 | parser.add_argument("--learning_rate", type=str, default=2e-5) # parsed as a string; cast to float when building TrainingArguments 21 | parser.add_argument("--per_device_train_batch_size", type=int, default=64) 22 | parser.add_argument("--per_device_eval_batch_size", type=int, default=64) 23 | parser.add_argument("--weight_decay", type=float, default=0.01) 24 | parser.add_argument("--evaluation_strategy", type=str, default="epoch") 25 | parser.add_argument("--disable_tqdm", type=bool, default=False) 26 | parser.add_argument("--logging_steps", type=int, default=100) 27 | 28 | # Push to Hub Parameters 29 | parser.add_argument("--push_to_hub", type=bool, default=True) 30 | parser.add_argument("--hub_model_id", type=str, default=None) 31 | parser.add_argument("--hub_strategy", type=str, default=None) 32 | parser.add_argument("--hub_token", type=str, default=None) 33 | 34 | # TODO: the options below are left disabled pending review (check with L.) 35 | # parser.add_argument("--warmup_steps", type=int, default=500) 36 | # parser.add_argument("--fp16", type=bool, default=True)
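# Editorial note, a sketch rather than part of the original script: argparse's
# `type=bool` does not parse strings the way one might expect: bool("False")
# is True, and SageMaker passes hyperparameters to the script as strings, so
# --disable_tqdm and --push_to_hub are truthy for any non-empty value. One
# minimal fix (hypothetical helper, not used above) is:
#
#     def str2bool(v):
#         return str(v).lower() in ("1", "t", "true", "yes")
#
# and then passing `type=str2bool` to the corresponding add_argument calls.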
37 | 38 | # Data, model, and output directories 39 | parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) 40 | parser.add_argument("--output_dir", type=str, default=os.environ["SM_MODEL_DIR"]) 41 | parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"]) 42 | parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) 43 | parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"]) 44 | 45 | args, _ = parser.parse_known_args() 46 | 47 | # make sure we have required parameters to push 48 | if args.push_to_hub: 49 | if args.hub_strategy is None: 50 | raise ValueError("--hub_strategy is required when pushing to Hub") 51 | if args.hub_token is None: 52 | raise ValueError("--hub_token is required when pushing to Hub") 53 | 54 | # sets hub id if not provided 55 | if args.hub_model_id is None: 56 | args.hub_model_id = args.model_id.replace("/", "--") 57 | 58 | # Set up logging 59 | logger = logging.getLogger(__name__) 60 | 61 | logging.basicConfig( 62 | level=logging.getLevelName("INFO"), 63 | handlers=[logging.StreamHandler(sys.stdout)], 64 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 65 | ) 66 | 67 | # load datasets 68 | train_dataset = load_from_disk(args.training_dir) 69 | test_dataset = load_from_disk(args.test_dir) 70 | 71 | logger.info(f" loaded train_dataset length is: {len(train_dataset)}") 72 | logger.info(f" loaded test_dataset length is: {len(test_dataset)}") 73 | 74 | metric = load_metric("accuracy") 75 | 76 | def compute_metrics(eval_pred): 77 | predictions, labels = eval_pred 78 | predictions = np.argmax(predictions, axis=1) 79 | return metric.compute(predictions=predictions, references=labels) 80 | 81 | # Prepare model labels - useful in inference API 82 | labels = train_dataset.features["label"].names 83 | num_labels = len(labels) 84 | label2id, id2label = dict(), dict() 85 | for i, label in enumerate(labels): 86 | label2id[label] = str(i) 87 | id2label[str(i)] = label 88 | 89 | # download model from model hub 90 | model = AutoModelForSequenceClassification.from_pretrained( 91 | args.model_id, num_labels=num_labels, label2id=label2id, id2label=id2label 92 | ) 93 | tokenizer = AutoTokenizer.from_pretrained(args.model_id) 94 | 95 | # define training args 96 | training_args = TrainingArguments( 97 | output_dir=args.output_dir, 98 | overwrite_output_dir=True if get_last_checkpoint(args.output_dir) is not None else False, 99 | num_train_epochs=args.num_train_epochs, 100 | learning_rate=float(args.learning_rate), 101 | per_device_train_batch_size=args.per_device_train_batch_size, 102 | per_device_eval_batch_size=args.per_device_eval_batch_size, 103 | weight_decay=args.weight_decay, 104 | evaluation_strategy=args.evaluation_strategy, 105 | disable_tqdm=args.disable_tqdm, 106 | logging_steps=args.logging_steps, 107 | # push to hub parameters 108 | push_to_hub=args.push_to_hub, 109 | hub_strategy=args.hub_strategy, 110 | hub_model_id=args.hub_model_id, 111 | hub_token=args.hub_token, 112 | save_strategy="epoch", 113 | save_total_limit=2, 114 | logging_dir=f"{args.output_data_dir}/logs", 115 | load_best_model_at_end=True, 116 | metric_for_best_model="accuracy" 117 | 118 | #warmup_steps=args.warmup_steps, 119 | #fp16=args.fp16, 120 | ) 121 | 122 | # create Trainer instance 123 | trainer = Trainer( 124 | model=model, 125 | args=training_args, 126 | compute_metrics=compute_metrics, 127 | train_dataset=train_dataset, 128 | eval_dataset=test_dataset, 129 | 
tokenizer=tokenizer, 130 | ) 131 | 132 | # train model 133 | if get_last_checkpoint(args.output_dir) is not None: 134 | logger.info("***** continue training *****") 135 | last_checkpoint = get_last_checkpoint(args.output_dir) 136 | trainer.train(resume_from_checkpoint=last_checkpoint) 137 | else: 138 | trainer.train() 139 | 140 | # evaluate model 141 | eval_result = trainer.evaluate(eval_dataset=test_dataset) 142 | 143 | # write the eval results to a file that can be accessed later in the S3 output 144 | with open(os.path.join(args.output_data_dir, "eval_results.txt"), "w") as writer: 145 | print("***** Eval results *****") 146 | for key, value in sorted(eval_result.items()): 147 | writer.write(f"{key} = {value}\n") 148 | print(f"{key} = {value}\n") 149 | 150 | # Save the model to S3; using os.environ["SM_MODEL_DIR"] makes sure checkpointing works 151 | trainer.save_model(os.environ["SM_MODEL_DIR"]) -------------------------------------------------------------------------------- /SageMaker/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | from textwrap import TextWrapper 4 | 5 | import datasets 6 | import huggingface_hub 7 | import matplotlib.font_manager as font_manager 8 | import matplotlib.pyplot as plt 9 | import torch 10 | import transformers 11 | from IPython.display import set_matplotlib_formats 12 | 13 | # TODO: Consider adding SageMaker StudioLab 14 | is_colab = "google.colab" in sys.modules 15 | is_kaggle = "kaggle_secrets" in sys.modules 16 | is_gpu_available = torch.cuda.is_available() 17 | 18 | 19 | def install_mpl_fonts(): 20 | font_dir = ["../orm_fonts/"] 21 | for font in font_manager.findSystemFonts(font_dir): 22 | font_manager.fontManager.addfont(font) 23 | 24 | 25 | def set_plot_style(): 26 | #install_mpl_fonts() 27 | set_matplotlib_formats("pdf", "svg") 28 | #plt.style.use("plotting.mplstyle") 29 | logging.getLogger("matplotlib").setLevel(level=logging.ERROR) 30 | 31 | 32 | def display_library_version(library): 33 | print(f"Using {library.__name__} v{library.__version__}") 34 | 35 | 36 | def setup_chapter(): 37 | # Check if we have a GPU 38 | if not is_gpu_available: 39 | print("No GPU was detected! 
This notebook can be *very* slow without a GPU 🐢") 40 | if is_colab: 41 | print("Go to Runtime > Change runtime type and select a GPU hardware accelerator.") 42 | if is_kaggle: 43 | print("Go to Settings > Accelerator and select GPU.") 44 | # Give visibility on versions of the core libraries 45 | display_library_version(transformers) 46 | display_library_version(datasets) 47 | # Disable all info / warning messages 48 | transformers.logging.set_verbosity_error() 49 | datasets.logging.set_verbosity_error() 50 | # Logging is only available for the chapters that don't depend on Haystack 51 | if huggingface_hub.__version__ == "0.0.19": 52 | huggingface_hub.logging.set_verbosity_error() 53 | # Use O'Reilly style for plots 54 | set_plot_style() 55 | 56 | 57 | def wrap_print_text(print): 58 | """Adapted from: https://stackoverflow.com/questions/27621655/how-to-overload-print-function-to-expand-its-functionality/27621927""" 59 | 60 | def wrapped_func(text): 61 | if not isinstance(text, str): 62 | text = str(text) 63 | wrapper = TextWrapper( 64 | width=80, 65 | break_long_words=True, 66 | break_on_hyphens=False, 67 | replace_whitespace=False, 68 | ) 69 | return print("\n".join(wrapper.fill(line) for line in text.split("\n"))) 70 | 71 | return wrapped_func 72 | 73 | 74 | print = wrap_print_text(print) 75 | -------------------------------------------------------------------------------- /environment-chapter7.yml: -------------------------------------------------------------------------------- 1 | name: book-chapter7 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - python=3.9 7 | - cudatoolkit=11.0 8 | - pip 9 | - notebook 10 | - ipykernel 11 | - pip: 12 | - farm-haystack==0.9.0 13 | - datasets==1.11.0 14 | - matplotlib 15 | - ipywidgets -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: book 2 | channels: 3 | - pytorch 4 | - defaults 5 | - pyg 6 | - conda-forge 7 | dependencies: 8 | - python=3.9 9 | - cudatoolkit=11.3 10 | - pytorch-scatter 11 | - pip 12 | - notebook 13 | - ipykernel 14 | - ipywidgets 15 | - git-lfs 16 | - libsndfile 17 | - pip: 18 | - -r requirements.txt 19 | -------------------------------------------------------------------------------- /images/book_cover.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/book_cover.jpg -------------------------------------------------------------------------------- /images/chapter01_enc-dec-attn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter01_enc-dec-attn.png -------------------------------------------------------------------------------- /images/chapter01_enc-dec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter01_enc-dec.png -------------------------------------------------------------------------------- /images/chapter01_hf-ecosystem.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter01_hf-ecosystem.png -------------------------------------------------------------------------------- /images/chapter01_hub-model-card.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter01_hub-model-card.png -------------------------------------------------------------------------------- /images/chapter01_hub-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter01_hub-overview.png -------------------------------------------------------------------------------- /images/chapter01_rnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter01_rnn.png -------------------------------------------------------------------------------- /images/chapter01_self-attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter01_self-attention.png -------------------------------------------------------------------------------- /images/chapter01_timeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter01_timeline.png -------------------------------------------------------------------------------- /images/chapter01_transfer-learning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter01_transfer-learning.png -------------------------------------------------------------------------------- /images/chapter01_ulmfit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter01_ulmfit.png -------------------------------------------------------------------------------- /images/chapter02_attention-alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter02_attention-alignment.png -------------------------------------------------------------------------------- /images/chapter02_attention-mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter02_attention-mask.png -------------------------------------------------------------------------------- /images/chapter02_encoder-classifier.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter02_encoder-classifier.png 
-------------------------------------------------------------------------------- /images/chapter02_encoder-feature-based.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter02_encoder-feature-based.png -------------------------------------------------------------------------------- /images/chapter02_encoder-fine-tuning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter02_encoder-fine-tuning.png -------------------------------------------------------------------------------- /images/chapter02_hf-libraries.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter02_hf-libraries.png -------------------------------------------------------------------------------- /images/chapter02_transformers-compact.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | chapter02_transformers-compact.html 6 | 7 | 8 | 9 |
10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /images/chapter02_transformers.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | chapter02_transformers.html 6 | 7 | 8 | 9 |
10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /images/chapter02_transformers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter02_transformers.png -------------------------------------------------------------------------------- /images/chapter02_tweet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter02_tweet.png -------------------------------------------------------------------------------- /images/chapter03_attention-ops.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter03_attention-ops.png -------------------------------------------------------------------------------- /images/chapter03_contextualized-embedding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter03_contextualized-embedding.png -------------------------------------------------------------------------------- /images/chapter03_decoder-zoom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter03_decoder-zoom.png -------------------------------------------------------------------------------- /images/chapter03_encoder-zoom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter03_encoder-zoom.png -------------------------------------------------------------------------------- /images/chapter03_layer-norm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter03_layer-norm.png -------------------------------------------------------------------------------- /images/chapter03_multihead-attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter03_multihead-attention.png -------------------------------------------------------------------------------- /images/chapter03_transformer-encoder-decoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter03_transformer-encoder-decoder.png -------------------------------------------------------------------------------- /images/chapter03_transformers-compact.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter03_transformers-compact.png 
-------------------------------------------------------------------------------- /images/chapter04_bert-body-head.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter04_bert-body-head.png -------------------------------------------------------------------------------- /images/chapter04_clf-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter04_clf-architecture.png -------------------------------------------------------------------------------- /images/chapter04_ner-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter04_ner-architecture.png -------------------------------------------------------------------------------- /images/chapter04_ner-widget.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter04_ner-widget.png -------------------------------------------------------------------------------- /images/chapter04_tokenizer-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter04_tokenizer-pipeline.png -------------------------------------------------------------------------------- /images/chapter05_beam-search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter05_beam-search.png -------------------------------------------------------------------------------- /images/chapter05_lm-meta-learning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter05_lm-meta-learning.png -------------------------------------------------------------------------------- /images/chapter05_meena.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter05_meena.png -------------------------------------------------------------------------------- /images/chapter05_text-generation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter05_text-generation.png -------------------------------------------------------------------------------- /images/chapter07_dpr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter07_dpr.png -------------------------------------------------------------------------------- /images/chapter07_marie-curie.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter07_marie-curie.png -------------------------------------------------------------------------------- /images/chapter07_phone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter07_phone.png -------------------------------------------------------------------------------- /images/chapter07_qa-head.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter07_qa-head.png -------------------------------------------------------------------------------- /images/chapter07_qa-pyramid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter07_qa-pyramid.png -------------------------------------------------------------------------------- /images/chapter07_rag-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter07_rag-architecture.png -------------------------------------------------------------------------------- /images/chapter07_retriever-reader.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter07_retriever-reader.png -------------------------------------------------------------------------------- /images/chapter07_sliding-window.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter07_sliding-window.png -------------------------------------------------------------------------------- /images/chapter07_squad-models.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter07_squad-models.png -------------------------------------------------------------------------------- /images/chapter07_squad-schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter07_squad-schema.png -------------------------------------------------------------------------------- /images/chapter07_squad-sota.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter07_squad-sota.png -------------------------------------------------------------------------------- /images/chapter08_bert-onnx.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_bert-onnx.png -------------------------------------------------------------------------------- /images/chapter08_fp32-to-int8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_fp32-to-int8.png -------------------------------------------------------------------------------- /images/chapter08_kd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_kd.png -------------------------------------------------------------------------------- /images/chapter08_magnitude-vs-movement.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_magnitude-vs-movement.png -------------------------------------------------------------------------------- /images/chapter08_network-pruning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_network-pruning.png -------------------------------------------------------------------------------- /images/chapter08_onnx-ort.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_onnx-ort.png -------------------------------------------------------------------------------- /images/chapter08_oos.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_oos.png -------------------------------------------------------------------------------- /images/chapter08_pegasus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_pegasus.png -------------------------------------------------------------------------------- /images/chapter08_pruning-dists.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_pruning-dists.png -------------------------------------------------------------------------------- /images/chapter08_roblox.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_roblox.png -------------------------------------------------------------------------------- /images/chapter08_soft-probs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_soft-probs.png 
-------------------------------------------------------------------------------- /images/chapter08_t5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_t5.png -------------------------------------------------------------------------------- /images/chapter09_decision-tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter09_decision-tree.png -------------------------------------------------------------------------------- /images/chapter09_faiss-index.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter09_faiss-index.png -------------------------------------------------------------------------------- /images/chapter09_issue-example-v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter09_issue-example-v2.png -------------------------------------------------------------------------------- /images/chapter09_nearest-neighbours.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter09_nearest-neighbours.png -------------------------------------------------------------------------------- /images/chapter09_uda.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter09_uda.png -------------------------------------------------------------------------------- /images/chapter09_ust.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter09_ust.png -------------------------------------------------------------------------------- /images/chapter10_code-snippet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter10_code-snippet.png -------------------------------------------------------------------------------- /images/chapter10_ddp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter10_ddp.png -------------------------------------------------------------------------------- /images/chapter10_preprocessing-clm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter10_preprocessing-clm.png -------------------------------------------------------------------------------- /images/chapter10_pretraining-clm.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter10_pretraining-clm.png -------------------------------------------------------------------------------- /images/chapter10_pretraining-mlm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter10_pretraining-mlm.png -------------------------------------------------------------------------------- /images/chapter10_pretraining-seq2seq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter10_pretraining-seq2seq.png -------------------------------------------------------------------------------- /images/chapter11_atomic-sparse-attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_atomic-sparse-attention.png -------------------------------------------------------------------------------- /images/chapter11_clip-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_clip-arch.png -------------------------------------------------------------------------------- /images/chapter11_compound-sparse-attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_compound-sparse-attention.png -------------------------------------------------------------------------------- /images/chapter11_dall-e.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_dall-e.png -------------------------------------------------------------------------------- /images/chapter11_efficient-attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_efficient-attention.png -------------------------------------------------------------------------------- /images/chapter11_iGPT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_iGPT.png -------------------------------------------------------------------------------- /images/chapter11_layoutlm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_layoutlm.png -------------------------------------------------------------------------------- /images/chapter11_linear-attention.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_linear-attention.png -------------------------------------------------------------------------------- /images/chapter11_scaling-modal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_scaling-modal.png -------------------------------------------------------------------------------- /images/chapter11_scaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_scaling.png -------------------------------------------------------------------------------- /images/chapter11_table-qa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_table-qa.png -------------------------------------------------------------------------------- /images/chapter11_tapas-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_tapas-architecture.png -------------------------------------------------------------------------------- /images/chapter11_vit-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_vit-architecture.png -------------------------------------------------------------------------------- /images/chapter11_vqa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_vqa.png -------------------------------------------------------------------------------- /images/chapter11_wav2vec-u.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_wav2vec-u.png -------------------------------------------------------------------------------- /images/chapter11_wav2vec2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_wav2vec2.png -------------------------------------------------------------------------------- /images/doge.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/doge.jpg -------------------------------------------------------------------------------- /images/optimusprime.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/optimusprime.jpg -------------------------------------------------------------------------------- /install.py: 
-------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | 4 | is_colab = "google.colab" in sys.modules 5 | is_kaggle = "kaggle_secrets" in sys.modules 6 | # torch-scatter binaries depend on the torch and CUDA version, so we define the 7 | # mappings here for Colab & Kaggle 8 | torch_to_cuda = {"1.10.0": "cu113", "1.9.0": "cu111", "1.9.1": "cu111"} 9 | 10 | 11 | def install_requirements( 12 | is_chapter2: bool = False, 13 | is_chapter6: bool = False, 14 | is_chapter7: bool = False, 15 | is_chapter7_v2: bool = False, 16 | is_chapter10: bool = False, 17 | is_chapter11: bool = False 18 | ): 19 | """Installs the required packages for the project.""" 20 | 21 | print("⏳ Installing base requirements ...") 22 | cmd = ["python", "-m", "pip", "install", "-r"] 23 | if is_chapter7: 24 | cmd += "requirements-chapter7.txt -f https://download.pytorch.org/whl/torch_stable.html".split() 25 | elif is_chapter7_v2: 26 | cmd.append("requirements-chapter7-v2.txt") 27 | else: 28 | cmd.append("requirements.txt") 29 | process_install = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 30 | if process_install.returncode != 0: 31 | raise Exception("😭 Failed to install base requirements") 32 | else: 33 | print("✅ Base requirements installed!") 34 | print("⏳ Installing Git LFS ...") 35 | process_lfs = subprocess.run(["apt", "install", "git-lfs"], stdout=subprocess.PIPE, stderr=subprocess.PIPE) 36 | if process_lfs.returncode != 0: 37 | raise Exception("😭 Failed to install Git LFS") 38 | else: 39 | print("✅ Git LFS installed!") 40 | 41 | if is_chapter2: 42 | transformers_cmd = "python -m pip install transformers==4.13.0 datasets==2.8.0".split() 43 | process_transformers = subprocess.run( 44 | transformers_cmd, 45 | stdout=subprocess.PIPE, 46 | stderr=subprocess.PIPE, 47 | ) 48 | 49 | if is_chapter6: 50 | datasets_cmd = "python -m pip install datasets==2.0.0".split() 51 | process_datasets = subprocess.run( 52 | datasets_cmd, 53 | stdout=subprocess.PIPE, 54 | stderr=subprocess.PIPE, 55 | ) 56 | 57 | if is_chapter10: 58 | wandb_cmd = "python -m pip install wandb".split() 59 | process_wandb = subprocess.run( 60 | wandb_cmd, 61 | stdout=subprocess.PIPE, 62 | stderr=subprocess.PIPE, 63 | ) 64 | if is_chapter11: 65 | import torch 66 | 67 | torch_version = torch.__version__.split("+")[0] 68 | print(f"⏳ Installing torch-scatter for torch v{torch_version} ...") 69 | if is_colab: 70 | torch_scatter_cmd = f"python -m pip install torch-scatter -f https://data.pyg.org/whl/torch-{torch_version}+{torch_to_cuda[torch_version]}.html".split() 71 | else: 72 | # Kaggle uses CUDA 11.0 by default, so we need to build from source 73 | torch_scatter_cmd = "python -m pip install torch-scatter".split() 74 | process_scatter = subprocess.run( 75 | torch_scatter_cmd, 76 | stdout=subprocess.PIPE, 77 | stderr=subprocess.PIPE, 78 | ) 79 | if process_scatter.returncode != 0: 80 | raise Exception("😭 Failed to install torch-scatter") 81 | else: 82 | print("✅ torch-scatter installed!") 83 | print("⏳ Installing soundfile ...") 84 | process_audio = subprocess.run( 85 | ["apt", "install", "libsndfile1"], stdout=subprocess.PIPE, stderr=subprocess.PIPE 86 | ) 87 | if process_audio.returncode != 0: 88 | raise Exception("😭 Failed to install soundfile") 89 | else: 90 | print("✅ soundfile installed!") 91 | print("🥳 Chapter installation complete!") 92 | -------------------------------------------------------------------------------- /plotting.mplstyle: 
/plotting.mplstyle:
--------------------------------------------------------------------------------
savefig.dpi: 300
figure.figsize: 6, 4  # figure size in inches

axes.prop_cycle: cycler('color', ['0071bc', 'f7931e', 'c1272d', '009245', 'ffde00', '9900cc'])

font.size: 12.0
font.family: Guardian Sans Cond
pdf.fonttype: 42
ps.fonttype: 42
--------------------------------------------------------------------------------
/requirements-chapter7-v2.txt:
--------------------------------------------------------------------------------
# Base requirements
farm-haystack[colab]==1.4.0
matplotlib
datasets
--------------------------------------------------------------------------------
/requirements-chapter7.txt:
--------------------------------------------------------------------------------
# Base requirements
farm-haystack==0.9.0
matplotlib
# Colab fix since FARM requires PyTorch v1.8.1 but v1.10.0 is installed by
# default. See: https://github.com/deepset-ai/haystack/issues/1787
torch==1.8.1+cu111
torchvision==0.9.1+cu111
torchaudio==0.8.1
# We need this version because haystack pins
# transformers 4.6.1 which depends on huggingface-hub==0.0.8
# and is incompatible with later versions of datasets
datasets==1.11.0
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
# Base requirements
transformers[tf,torch,sentencepiece,vision,optuna,sklearn,onnxruntime]==4.16.2
datasets[audio]==1.16.1
matplotlib
ipywidgets
# Chapter 2 - Classification
umap-learn==0.5.1
# Chapter 3 - Anatomy
bertviz==1.2.0
# Chapter 4 - NER
seqeval==1.2.2
# Chapter 6 - Summarization
nltk==3.9
sacrebleu==1.5.1
rouge-score==0.0.4
py7zr  # Needed for samsum dataset
# Chapter 9 - Few labels
nlpaug==1.1.7
scikit-multilearn==0.2.0
# Chapter 10 - Pretraining
psutil
accelerate==0.5.1
--------------------------------------------------------------------------------
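Because several chapters override these base pins at runtime (see install.py above), it can be useful to confirm which versions actually resolved after installation. A minimal sketch, not part of the repo; the pins in the dictionary mirror requirements.txt:

# Minimal sketch (not part of the repo): check that the base pins from
# requirements.txt resolved to the expected versions.
from importlib import metadata

pins = {"transformers": "4.16.2", "datasets": "1.16.1", "accelerate": "0.5.1"}
for pkg, pinned in pins.items():
    installed = metadata.version(pkg)
    marker = "✅" if installed == pinned else "⚠️"
    print(f"{marker} {pkg}: pinned {pinned}, installed {installed}")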
/scripts/create_notebook_table.py:
--------------------------------------------------------------------------------
import pandas as pd

GITHUB_PATH_PREFIX = "nlp-with-transformers/notebooks/blob/main/"

CHAPTER_TO_NB = {
    "Introduction": "01_introduction",
    "Text Classification": "02_classification",
    "Transformer Anatomy": "03_transformer-anatomy",
    "Multilingual Named Entity Recognition": "04_multilingual-ner",
    "Text Generation": "05_text-generation",
    "Summarization": "06_summarization",
    "Question Answering": "07_question-answering",
    "Making Transformers Efficient in Production": "08_model-compression",
    "Dealing with Few to No Labels": "09_few-to-no-labels",
    "Training Transformers from Scratch": "10_transformers-from-scratch",
    "Future Directions": "11_future-directions",
}


def _find_text_in_file(filename, start_prompt, end_prompt):
    """
    Find the text in `filename` between a line beginning with `start_prompt` and before `end_prompt`, removing empty
    lines.

    Copied from: https://github.com/huggingface/transformers/blob/16f0b7d72c6d4e122957392c342b074aa2c5c519/utils/check_table.py#L30
    """
    with open(filename, "r", encoding="utf-8", newline="\n") as f:
        lines = f.readlines()
    # Find the start prompt.
    start_index = 0
    while not lines[start_index].startswith(start_prompt):
        start_index += 1
    start_index += 1

    end_index = start_index
    while not lines[end_index].startswith(end_prompt):
        end_index += 1
    end_index -= 1

    while len(lines[start_index]) <= 1:
        start_index += 1
    while len(lines[end_index]) <= 1:
        end_index -= 1
    end_index += 1
    return "".join(lines[start_index:end_index]), start_index, end_index, lines


def create_table():
    data = {"Chapter": [], "Colab": [], "Kaggle": [], "Gradient": [], "Studio Lab": []}
    for title, nb in CHAPTER_TO_NB.items():
        nb_path = f"{GITHUB_PATH_PREFIX}{nb}.ipynb"
        data["Chapter"].append(title)
        data["Colab"].append(
            f"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/{nb_path})"
        )
        data["Kaggle"].append(
            f"[![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/{nb_path})"
        )
        data["Gradient"].append(
            f"[![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/{nb_path})"
        )
        data["Studio Lab"].append(
            f"[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/{nb_path})"
        )
    return pd.DataFrame(data).to_markdown(index=False) + "\n"


def main():
    table = create_table()
    _, start_index, end_index, lines = _find_text_in_file(
        filename="README.md",
        start_prompt="",
        end_prompt="",
    )

    with open("README.md", "w", encoding="utf-8", newline="\n") as f:
        f.writelines(lines[:start_index] + [table] + lines[end_index:])


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
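Since main() rewrites README.md in place between two sentinel lines, a safer first step is to preview the generated table on its own. A minimal sketch, assuming it runs from the repository root and that the tabulate package (required by pandas' to_markdown) is installed:

# Minimal sketch (not part of the repo): preview the badge table
# without touching README.md.
from scripts.create_notebook_table import create_table

print(create_table())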
/settings.ini:
--------------------------------------------------------------------------------
[DEFAULT]
lib_name = notebooks
user = nlp-with-transformers
description = notebooks
keywords = jupyter notebook asciidoc
author = Lewis Tunstall and Leandro von Werra and Thomas Wolf
nbs_path = .
host = github
doc_host = https://nlp-with-transformers.github.io
doc_baseurl = /notebooks/
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import logging
import sys
from textwrap import TextWrapper

import datasets
import huggingface_hub
import matplotlib.font_manager as font_manager
import matplotlib.pyplot as plt
import torch
import transformers
from IPython.display import set_matplotlib_formats

# TODO: Consider adding SageMaker StudioLab
is_colab = "google.colab" in sys.modules
is_kaggle = "kaggle_secrets" in sys.modules
is_gpu_available = torch.cuda.is_available()


def install_mpl_fonts():
    font_dir = ["./orm_fonts/"]
    for font in font_manager.findSystemFonts(font_dir):
        font_manager.fontManager.addfont(font)


def set_plot_style():
    install_mpl_fonts()
    set_matplotlib_formats("pdf", "svg")
    plt.style.use("plotting.mplstyle")
    logging.getLogger("matplotlib").setLevel(level=logging.ERROR)


def display_library_version(library):
    print(f"Using {library.__name__} v{library.__version__}")


def setup_chapter():
    # Check if we have a GPU
    if not is_gpu_available:
        print("No GPU was detected! This notebook can be *very* slow without a GPU 🐢")
        if is_colab:
            print("Go to Runtime > Change runtime type and select a GPU hardware accelerator.")
        if is_kaggle:
            print("Go to Settings > Accelerator and select GPU.")
    # Give visibility on versions of the core libraries
    display_library_version(transformers)
    display_library_version(datasets)
    # Disable all info / warning messages
    transformers.logging.set_verbosity_error()
    datasets.logging.set_verbosity_error()
    # Logging is only available for the chapters that don't depend on Haystack
    if huggingface_hub.__version__ == "0.0.19":
        huggingface_hub.logging.set_verbosity_error()
    # Use O'Reilly style for plots
    set_plot_style()


def wrap_print_text(print):
    """Adapted from: https://stackoverflow.com/questions/27621655/how-to-overload-print-function-to-expand-its-functionality/27621927"""

    def wrapped_func(text):
        if not isinstance(text, str):
            text = str(text)
        wrapper = TextWrapper(
            width=80,
            break_long_words=True,
            break_on_hyphens=False,
            replace_whitespace=False,
        )
        return print("\n".join(wrapper.fill(line) for line in text.split("\n")))

    return wrapped_func


# Wrap all print() calls at 80 characters so long model outputs stay readable
print = wrap_print_text(print)
--------------------------------------------------------------------------------
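The helpers above are meant to be star-imported at the top of each chapter notebook. A sketch of a typical first cell, assuming the notebook runs from the repository root so that plotting.mplstyle and the orm_fonts/ directory are reachable:

# Typical first cell of a chapter notebook (run from the repository root)
from utils import *

setup_chapter()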