├── .github
│   └── ISSUE_TEMPLATE
│       ├── bug_report.md
│       └── question-about-the-book.md
├── .gitignore
├── 01_introduction.ipynb
├── 02_classification.ipynb
├── 03_transformer-anatomy.ipynb
├── 04_multilingual-ner.ipynb
├── 05_text-generation.ipynb
├── 06_summarization.ipynb
├── 07_question-answering.ipynb
├── 07_question_answering_v2.ipynb
├── 08_model-compression.ipynb
├── 09_few-to-no-labels.ipynb
├── 10_transformers-from-scratch.ipynb
├── 11_future-directions.ipynb
├── LICENSE
├── README.md
├── SageMaker
│   ├── 01_introduction.ipynb
│   ├── 02_classification.ipynb
│   ├── README.md
│   ├── images
│   │   ├── git_repo.png
│   │   ├── iam_role.png
│   │   └── notebook_config.png
│   ├── scripts
│   │   └── 02_classification_train.py
│   └── utils.py
├── data
│   └── github-issues-transformers.jsonl
├── environment-chapter7.yml
├── environment.yml
├── images
│   ├── book_cover.jpg
│   ├── chapter01_enc-dec-attn.png
│   ├── chapter01_enc-dec.png
│   ├── chapter01_hf-ecosystem.png
│   ├── chapter01_hub-model-card.png
│   ├── chapter01_hub-overview.png
│   ├── chapter01_rnn.png
│   ├── chapter01_self-attention.png
│   ├── chapter01_timeline.png
│   ├── chapter01_transfer-learning.png
│   ├── chapter01_ulmfit.png
│   ├── chapter02_attention-alignment.png
│   ├── chapter02_attention-mask.png
│   ├── chapter02_encoder-classifier.png
│   ├── chapter02_encoder-feature-based.png
│   ├── chapter02_encoder-fine-tuning.png
│   ├── chapter02_hf-libraries.png
│   ├── chapter02_transformers-compact.html
│   ├── chapter02_transformers.html
│   ├── chapter02_transformers.png
│   ├── chapter02_tweet.png
│   ├── chapter03_attention-ops.png
│   ├── chapter03_contextualized-embedding.png
│   ├── chapter03_decoder-zoom.png
│   ├── chapter03_encoder-zoom.png
│   ├── chapter03_layer-norm.png
│   ├── chapter03_multihead-attention.png
│   ├── chapter03_transformer-encoder-decoder.png
│   ├── chapter03_transformers-compact.png
│   ├── chapter04_bert-body-head.png
│   ├── chapter04_clf-architecture.png
│   ├── chapter04_ner-architecture.png
│   ├── chapter04_ner-widget.png
│   ├── chapter04_tokenizer-pipeline.png
│   ├── chapter05_beam-search.png
│   ├── chapter05_lm-meta-learning.png
│   ├── chapter05_meena.png
│   ├── chapter05_text-generation.png
│   ├── chapter07_dpr.png
│   ├── chapter07_marie-curie.png
│   ├── chapter07_phone.png
│   ├── chapter07_qa-head.png
│   ├── chapter07_qa-pyramid.png
│   ├── chapter07_rag-architecture.png
│   ├── chapter07_retriever-reader.png
│   ├── chapter07_sliding-window.png
│   ├── chapter07_squad-models.png
│   ├── chapter07_squad-schema.png
│   ├── chapter07_squad-sota.png
│   ├── chapter08_bert-onnx.png
│   ├── chapter08_fp32-to-int8.png
│   ├── chapter08_kd.png
│   ├── chapter08_magnitude-vs-movement.png
│   ├── chapter08_network-pruning.png
│   ├── chapter08_onnx-ort.png
│   ├── chapter08_oos.png
│   ├── chapter08_pegasus.png
│   ├── chapter08_pruning-dists.png
│   ├── chapter08_roblox.png
│   ├── chapter08_soft-probs.png
│   ├── chapter08_t5.png
│   ├── chapter09_decision-tree.png
│   ├── chapter09_faiss-index.png
│   ├── chapter09_issue-example-v2.png
│   ├── chapter09_nearest-neighbours.png
│   ├── chapter09_uda.png
│   ├── chapter09_ust.png
│   ├── chapter10_code-snippet.png
│   ├── chapter10_ddp.png
│   ├── chapter10_preprocessing-clm.png
│   ├── chapter10_pretraining-clm.png
│   ├── chapter10_pretraining-mlm.png
│   ├── chapter10_pretraining-seq2seq.png
│   ├── chapter11_atomic-sparse-attention.png
│   ├── chapter11_clip-arch.png
│   ├── chapter11_compound-sparse-attention.png
│   ├── chapter11_dall-e.png
│   ├── chapter11_efficient-attention.png
│   ├── chapter11_iGPT.png
│   ├── chapter11_layoutlm.png
│   ├── chapter11_linear-attention.png
│   ├── chapter11_scaling-modal.png
│   ├── chapter11_scaling.png
│   ├── chapter11_table-qa.png
│   ├── chapter11_tapas-architecture.png
│   ├── chapter11_vit-architecture.png
│   ├── chapter11_vqa.png
│   ├── chapter11_wav2vec-u.png
│   ├── chapter11_wav2vec2.png
│   ├── doge.jpg
│   └── optimusprime.jpg
├── install.py
├── plotting.mplstyle
├── requirements-chapter7-v2.txt
├── requirements-chapter7.txt
├── requirements.txt
├── scripts
│   └── create_notebook_table.py
├── settings.ini
└── utils.py
/.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Submit a report to help us improve the book 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Information 11 | 12 | The problem arises in chapter: 13 | 14 | * [ ] Introduction 15 | * [ ] Text Classification 16 | * [ ] Transformer Anatomy 17 | * [ ] Multilingual Named Entity Recognition 18 | * [ ] Text Generation 19 | * [ ] Summarization 20 | * [ ] Question Answering 21 | * [ ] Making Transformers Efficient in Production 22 | * [ ] Dealing with Few to No Labels 23 | * [ ] Training Transformers from Scratch 24 | * [ ] Future Directions 25 | 26 | 27 | ## Describe the bug 28 | 29 | 30 | ## To Reproduce 31 | Steps to reproduce the behavior: 32 | 33 | 1. 34 | 2. 35 | 3. 36 | 37 | 40 | 41 | ## Expected behavior 42 | 43 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question-about-the-book.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question or comment about the book 3 | about: Have a general question or comment about the book content? Ask it here! 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Information 11 | 12 | The question or comment is about chapter: 13 | 14 | * [ ] Introduction 15 | * [ ] Text Classification 16 | * [ ] Transformer Anatomy 17 | * [ ] Multilingual Named Entity Recognition 18 | * [ ] Text Generation 19 | * [ ] Summarization 20 | * [ ] Question Answering 21 | * [ ] Making Transformers Efficient in Production 22 | * [ ] Dealing with Few to No Labels 23 | * [ ] Training Transformers from Scratch 24 | * [ ] Future Directions 25 | 26 | ## Question or comment -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # mac 132 | .DS_Store 133 | -------------------------------------------------------------------------------- /01_introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Uncomment and run this cell if you're on Colab or Kaggle\n", 10 | "# !git clone https://github.com/nlp-with-transformers/notebooks.git\n", 11 | "# %cd notebooks\n", 12 | "# from install import *\n", 13 | "# install_requirements()" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "#hide\n", 23 | "from utils import *\n", 24 | "setup_chapter()" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "# Hello Transformers" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "\"transformer-timeline\"" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## The Encoder-Decoder Framework" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "\"rnn\"" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "\"enc-dec\"" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## Attention Mechanisms" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "\"enc-dec-attn\" " 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | 
"\"attention-alignment\" " 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "\"transformer-self-attn\" " 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "## Transfer Learning in NLP" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "\"transfer-learning\" " 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "\"ulmfit\"" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "## Hugging Face Transformers: Bridging the Gap" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "## A Tour of Transformer Applications" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "text = \"\"\"Dear Amazon, last week I ordered an Optimus Prime action figure \\\n", 132 | "from your online store in Germany. Unfortunately, when I opened the package, \\\n", 133 | "I discovered to my horror that I had been sent an action figure of Megatron \\\n", 134 | "instead! As a lifelong enemy of the Decepticons, I hope you can understand my \\\n", 135 | "dilemma. To resolve the issue, I demand an exchange of Megatron for the \\\n", 136 | "Optimus Prime figure I ordered. Enclosed are copies of my records concerning \\\n", 137 | "this purchase. I expect to hear from you soon. Sincerely, Bumblebee.\"\"\"" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "### Text Classification" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "#hide_output\n", 154 | "from transformers import pipeline\n", 155 | "\n", 156 | "classifier = pipeline(\"text-classification\")" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/html": [ 167 | "
\n", 168 | "\n", 181 | "\n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | "
labelscore
0NEGATIVE0.901546
\n", 197 | "
" 198 | ], 199 | "text/plain": [ 200 | " label score\n", 201 | "0 NEGATIVE 0.901546" 202 | ] 203 | }, 204 | "execution_count": null, 205 | "metadata": {}, 206 | "output_type": "execute_result" 207 | } 208 | ], 209 | "source": [ 210 | "import pandas as pd\n", 211 | "\n", 212 | "outputs = classifier(text)\n", 213 | "pd.DataFrame(outputs) " 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "### Named Entity Recognition" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "data": { 230 | "text/html": [ 231 | "
\n", 232 | "\n", 245 | "\n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | "
entity_groupscorewordstartend
0ORG0.879010Amazon511
1MISC0.990859Optimus Prime3649
2LOC0.999755Germany9097
3MISC0.556569Mega208212
4PER0.590256##tron212216
5ORG0.669692Decept253259
6MISC0.498350##icons259264
7MISC0.775361Megatron350358
8MISC0.987854Optimus Prime367380
9PER0.812096Bumblebee502511
\n", 339 | "
" 340 | ], 341 | "text/plain": [ 342 | " entity_group score word start end\n", 343 | "0 ORG 0.879010 Amazon 5 11\n", 344 | "1 MISC 0.990859 Optimus Prime 36 49\n", 345 | "2 LOC 0.999755 Germany 90 97\n", 346 | "3 MISC 0.556569 Mega 208 212\n", 347 | "4 PER 0.590256 ##tron 212 216\n", 348 | "5 ORG 0.669692 Decept 253 259\n", 349 | "6 MISC 0.498350 ##icons 259 264\n", 350 | "7 MISC 0.775361 Megatron 350 358\n", 351 | "8 MISC 0.987854 Optimus Prime 367 380\n", 352 | "9 PER 0.812096 Bumblebee 502 511" 353 | ] 354 | }, 355 | "execution_count": null, 356 | "metadata": {}, 357 | "output_type": "execute_result" 358 | } 359 | ], 360 | "source": [ 361 | "ner_tagger = pipeline(\"ner\", aggregation_strategy=\"simple\")\n", 362 | "outputs = ner_tagger(text)\n", 363 | "pd.DataFrame(outputs) " 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "### Question Answering " 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": {}, 377 | "outputs": [ 378 | { 379 | "data": { 380 | "text/html": [ 381 | "
\n", 382 | "\n", 395 | "\n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | "
scorestartendanswer
00.631291335358an exchange of Megatron
\n", 415 | "
" 416 | ], 417 | "text/plain": [ 418 | " score start end answer\n", 419 | "0 0.631291 335 358 an exchange of Megatron" 420 | ] 421 | }, 422 | "execution_count": null, 423 | "metadata": {}, 424 | "output_type": "execute_result" 425 | } 426 | ], 427 | "source": [ 428 | "reader = pipeline(\"question-answering\")\n", 429 | "question = \"What does the customer want?\"\n", 430 | "outputs = reader(question=question, context=text)\n", 431 | "pd.DataFrame([outputs]) " 432 | ] 433 | }, 434 | { 435 | "cell_type": "markdown", 436 | "metadata": {}, 437 | "source": [ 438 | "### Summarization" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": {}, 445 | "outputs": [ 446 | { 447 | "name": "stdout", 448 | "output_type": "stream", 449 | "text": [ 450 | " Bumblebee ordered an Optimus Prime action figure from your online store in\n", 451 | "Germany. Unfortunately, when I opened the package, I discovered to my horror\n", 452 | "that I had been sent an action figure of Megatron instead.\n" 453 | ] 454 | } 455 | ], 456 | "source": [ 457 | "summarizer = pipeline(\"summarization\")\n", 458 | "outputs = summarizer(text, max_length=45, clean_up_tokenization_spaces=True)\n", 459 | "print(outputs[0]['summary_text'])" 460 | ] 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "metadata": {}, 465 | "source": [ 466 | "### Translation" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": {}, 473 | "outputs": [ 474 | { 475 | "name": "stdout", 476 | "output_type": "stream", 477 | "text": [ 478 | "Sehr geehrter Amazon, letzte Woche habe ich eine Optimus Prime Action Figur aus\n", 479 | "Ihrem Online-Shop in Deutschland bestellt. Leider, als ich das Paket öffnete,\n", 480 | "entdeckte ich zu meinem Entsetzen, dass ich stattdessen eine Action Figur von\n", 481 | "Megatron geschickt worden war! Als lebenslanger Feind der Decepticons, Ich\n", 482 | "hoffe, Sie können mein Dilemma verstehen. Um das Problem zu lösen, Ich fordere\n", 483 | "einen Austausch von Megatron für die Optimus Prime Figur habe ich bestellt.\n", 484 | "Anbei sind Kopien meiner Aufzeichnungen über diesen Kauf. Ich erwarte, bald von\n", 485 | "Ihnen zu hören. Aufrichtig, Bumblebee.\n" 486 | ] 487 | } 488 | ], 489 | "source": [ 490 | "translator = pipeline(\"translation_en_to_de\", \n", 491 | " model=\"Helsinki-NLP/opus-mt-en-de\")\n", 492 | "outputs = translator(text, clean_up_tokenization_spaces=True, min_length=100)\n", 493 | "print(outputs[0]['translation_text'])" 494 | ] 495 | }, 496 | { 497 | "cell_type": "markdown", 498 | "metadata": {}, 499 | "source": [ 500 | "### Text Generation" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "#hide\n", 510 | "from transformers import set_seed\n", 511 | "set_seed(42) # Set the seed to get reproducible results" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": {}, 518 | "outputs": [ 519 | { 520 | "name": "stdout", 521 | "output_type": "stream", 522 | "text": [ 523 | "Dear Amazon, last week I ordered an Optimus Prime action figure from your online\n", 524 | "store in Germany. Unfortunately, when I opened the package, I discovered to my\n", 525 | "horror that I had been sent an action figure of Megatron instead! As a lifelong\n", 526 | "enemy of the Decepticons, I hope you can understand my dilemma. 
To resolve the\n", 527 | "issue, I demand an exchange of Megatron for the Optimus Prime figure I ordered.\n", 528 | "Enclosed are copies of my records concerning this purchase. I expect to hear\n", 529 | "from you soon. Sincerely, Bumblebee.\n", 530 | "\n", 531 | "Customer service response:\n", 532 | "Dear Bumblebee, I am sorry to hear that your order was mixed up. The order was\n", 533 | "completely mislabeled, which is very common in our online store, but I can\n", 534 | "appreciate it because it was my understanding from this site and our customer\n", 535 | "service of the previous day that your order was not made correct in our mind and\n", 536 | "that we are in a process of resolving this matter. We can assure you that your\n", 537 | "order\n" 538 | ] 539 | } 540 | ], 541 | "source": [ 542 | "generator = pipeline(\"text-generation\")\n", 543 | "response = \"Dear Bumblebee, I am sorry to hear that your order was mixed up.\"\n", 544 | "prompt = text + \"\\n\\nCustomer service response:\\n\" + response\n", 545 | "outputs = generator(prompt, max_length=200)\n", 546 | "print(outputs[0]['generated_text'])" 547 | ] 548 | }, 549 | { 550 | "cell_type": "markdown", 551 | "metadata": {}, 552 | "source": [ 553 | "## The Hugging Face Ecosystem" 554 | ] 555 | }, 556 | { 557 | "cell_type": "markdown", 558 | "metadata": {}, 559 | "source": [ 560 | "\"ecosystem\"" 561 | ] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "metadata": {}, 566 | "source": [ 567 | "### The Hugging Face Hub" 568 | ] 569 | }, 570 | { 571 | "cell_type": "markdown", 572 | "metadata": {}, 573 | "source": [ 574 | "\"hub-overview\" " 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "metadata": {}, 580 | "source": [ 581 | "\"hub-model-card\" " 582 | ] 583 | }, 584 | { 585 | "cell_type": "markdown", 586 | "metadata": {}, 587 | "source": [ 588 | "### Hugging Face Tokenizers" 589 | ] 590 | }, 591 | { 592 | "cell_type": "markdown", 593 | "metadata": {}, 594 | "source": [ 595 | "### Hugging Face Datasets" 596 | ] 597 | }, 598 | { 599 | "cell_type": "markdown", 600 | "metadata": {}, 601 | "source": [ 602 | "### Hugging Face Accelerate" 603 | ] 604 | }, 605 | { 606 | "cell_type": "markdown", 607 | "metadata": {}, 608 | "source": [ 609 | "## Main Challenges with Transformers" 610 | ] 611 | }, 612 | { 613 | "cell_type": "markdown", 614 | "metadata": {}, 615 | "source": [ 616 | "## Conclusion" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": null, 622 | "metadata": {}, 623 | "outputs": [], 624 | "source": [] 625 | } 626 | ], 627 | "metadata": { 628 | "kernelspec": { 629 | "display_name": "Python 3 (ipykernel)", 630 | "language": "python", 631 | "name": "python3" 632 | } 633 | }, 634 | "nbformat": 4, 635 | "nbformat_minor": 4 636 | } 637 | -------------------------------------------------------------------------------- /10_transformers-from-scratch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Uncomment and run this cell if you're on Colab or Kaggle\n", 10 | "# !git clone https://github.com/nlp-with-transformers/notebooks.git\n", 11 | "# %cd notebooks\n", 12 | "# from install import *\n", 13 | "# install_requirements(is_chapter10=True)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "# hide\n", 23 | "from utils import 
*\n", 24 | "setup_chapter()" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "# Training Transformers from Scratch" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "> **Note:** In this chapter a large dataset and the script to train a large language model on a distributed infrastructure are built. As such not all the steps in this notebook are executable on platforms such as Colab or Kaggle. Either downscale the steps at critical points or use this notebook as an inspiration when building a script for distributed training." 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## Large Datasets and Where to Find Them" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Challenges of Building a Large-Scale Corpus" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "#hide_output\n", 62 | "from transformers import pipeline, set_seed\n", 63 | "\n", 64 | "generation_gpt = pipeline(\"text-generation\", model=\"openai-gpt\")\n", 65 | "generation_gpt2 = pipeline(\"text-generation\", model=\"gpt2\")" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "GPT size: 116.5M parameters\n", 78 | "GPT2 size: 124.4M parameters\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "def model_size(model):\n", 84 | " return sum(t.numel() for t in model.parameters())\n", 85 | "\n", 86 | "print(f\"GPT size: {model_size(generation_gpt.model)/1000**2:.1f}M parameters\")\n", 87 | "print(f\"GPT2 size: {model_size(generation_gpt2.model)/1000**2:.1f}M parameters\")" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "# hide\n", 97 | "set_seed(1)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "GPT completions:\n", 110 | "1.\n", 111 | "When they came back.\n", 112 | " \" we need all we can get, \" jason said once they had settled into the back of\n", 113 | "the truck without anyone stopping them. \" after getting out here, it 'll be up\n", 114 | "to us what to find. for now\n", 115 | "2.\n", 116 | "When they came back.\n", 117 | " his gaze swept over her body. he 'd dressed her, too, in the borrowed clothes\n", 118 | "that she 'd worn for the journey.\n", 119 | " \" i thought it would be easier to just leave you there. \" a woman like\n", 120 | "3.\n", 121 | "When they came back to the house and she was sitting there with the little boy.\n", 122 | " \" don't be afraid, \" he told her. she nodded slowly, her eyes wide. she was so\n", 123 | "lost in whatever she discovered that tom knew her mistake\n", 124 | "\n", 125 | "GPT-2 completions:\n", 126 | "1.\n", 127 | "When they came back we had a big dinner and the other guys went to see what\n", 128 | "their opinion was on her. I did an hour and they were happy with it.\n", 129 | "2.\n", 130 | "When they came back to this island there had been another massacre, but he could\n", 131 | "not help but feel pity for the helpless victim who had been left to die, and\n", 132 | "that they had failed that day. 
And so was very, very grateful indeed.\n", 133 | "3.\n", 134 | "When they came back to our house after the morning, I asked if she was sure. She\n", 135 | "said, \"Nope.\" The two kids were gone that morning. I thought they were back to\n", 136 | "being a good friend.\n", 137 | "\n", 138 | "When Dost\n" 139 | ] 140 | } 141 | ], 142 | "source": [ 143 | "def enum_pipeline_outputs(pipe, prompt, num_return_sequences):\n", 144 | "    out = pipe(prompt, num_return_sequences=num_return_sequences,\n", 145 | "               clean_up_tokenization_spaces=True)\n", 146 | "    return \"\\n\".join(f\"{i+1}.\" + s[\"generated_text\"] for i, s in enumerate(out))\n", 147 | "\n", 148 | "prompt = \"\\nWhen they came back\"\n", 149 | "print(\"GPT completions:\\n\" + enum_pipeline_outputs(generation_gpt, prompt, 3))\n", 150 | "print(\"\")\n", 151 | "print(\"GPT-2 completions:\\n\" + enum_pipeline_outputs(generation_gpt2, prompt, 3))" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "### Building a Custom Code Dataset\n" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "#### Creating a dataset with Google BigQuery" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "#sidebar To Filter the Noise or Not?" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "### Working with Large Datasets" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "#### Memory mapping" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "> **Note:** The following code block assumes that you have downloaded the BigQuery dataset to a folder called `codeparrot`. We suggest skipping this step since it will unpack the compressed files and require ~180GB of disk space. This code is for demonstration purposes only; you can simply continue below with the streamed dataset, which will not consume that much disk space."
194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "#hide_output\n", 203 | "from datasets import load_dataset, DownloadConfig\n", 204 | "\n", 205 | "download_config = DownloadConfig(delete_extracted=True)\n", 206 | "dataset = load_dataset(\"./codeparrot\", split=\"train\",\n", 207 | " download_config=download_config)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "Number of python files code in dataset : 18695559\n", 220 | "Dataset size (cache file) : 183.68 GB\n", 221 | "RAM memory used: 4924 MB\n" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "import psutil, os\n", 227 | "\n", 228 | "print(f\"Number of python files code in dataset : {len(dataset)}\")\n", 229 | "ds_size = sum(os.stat(f[\"filename\"]).st_size for f in dataset.cache_files)\n", 230 | "# os.stat.st_size is expressed in bytes, so we convert to GB\n", 231 | "print(f\"Dataset size (cache file) : {ds_size / 2**30:.2f} GB\")\n", 232 | "# Process.memory_info is expressed in bytes, so we convert to MB\n", 233 | "print(f\"RAM used: {psutil.Process(os.getpid()).memory_info().rss >> 20} MB\")" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "#### Streaming" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "name": "stderr", 250 | "output_type": "stream", 251 | "text": [ 252 | "Using custom data configuration default-cae7a1d2f0dbde67\n" 253 | ] 254 | } 255 | ], 256 | "source": [ 257 | "# hide_output\n", 258 | "streamed_dataset = load_dataset('./codeparrot', split=\"train\", streaming=True)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "name": "stdout", 268 | "output_type": "stream", 269 | "text": [ 270 | "True\n", 271 | "True\n" 272 | ] 273 | } 274 | ], 275 | "source": [ 276 | "iterator = iter(streamed_dataset)\n", 277 | "\n", 278 | "print(dataset[0] == next(iterator))\n", 279 | "print(dataset[1] == next(iterator))" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "remote_dataset = load_dataset('transformersbook/codeparrot', split=\"train\",\n", 289 | " streaming=True)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "### Adding Datasets to the Hugging Face Hub" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "## Building a Tokenizer" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [ 311 | { 312 | "data": { 313 | "application/vnd.jupyter.widget-view+json": { 314 | "model_id": "29ced71e91434126970160a03cc006a5", 315 | "version_major": 2, 316 | "version_minor": 0 317 | }, 318 | "text/plain": [ 319 | "Downloading: 0%| | 0.00/1.17k [00:00\n", 570 | " \n", 571 | " \n", 572 | " Description\n", 573 | " Character\n", 574 | " Bytes\n", 575 | " Mapped bytes\n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " Regular characters\n", 581 | " `a` and `?`\n", 582 | " 97 and 63\n", 583 | " `a` and `?`\n", 584 | " \n", 585 | " \n", 586 | " Non-printable control character 
(CARRIAGE RETURN)\n", 587 | " `U+000D`\n", 588 | " 13\n", 589 | " `č`\n", 590 | " \n", 591 | " \n", 592 | " A space\n", 593 | " ` `\n", 594 | " 32\n", 595 | " `Ġ`\n", 596 | " \n", 597 | " \n", 598 | " A non-breakable space\n", 599 | " `\\xa0`\n", 600 | " 160\n", 601 | " `ł`\n", 602 | " \n", 603 | " \n", 604 | " A newline character\n", 605 | " `\\n`\n", 606 | " 10\n", 607 | " `Ċ`\n", 608 | " \n", 609 | " \n", 610 | "" 611 | ], 612 | "text/plain": [ 613 | "" 614 | ] 615 | }, 616 | "metadata": {}, 617 | "output_type": "display_data" 618 | } 619 | ], 620 | "source": [ 621 | "# hide_input\n", 622 | "#id unicode_mapping\n", 623 | "#caption Examples of character mappings in BPE\n", 624 | "#hide_input\n", 625 | "import pandas as pd\n", 626 | "from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode\n", 627 | "\n", 628 | "byte_to_unicode_map = bytes_to_unicode()\n", 629 | "unicode_to_byte_map = dict((v, k) for k, v in byte_to_unicode_map.items())\n", 630 | "base_vocab = list(unicode_to_byte_map.keys())\n", 631 | "\n", 632 | "examples = [\n", 633 | " ['Regular characters', '`a` and `?`', f'{ord(\"a\")} and {ord(\"?\")}' , f'`{byte_to_unicode_map[ord(\"a\")]}` and `{byte_to_unicode_map[ord(\"?\")]}`'],\n", 634 | " ['Nonprintable control character (carriage return)', '`U+000D`', f'13', f'`{byte_to_unicode_map[13]}`'],\n", 635 | " ['A space', '` `', f'{ord(\" \")}', f'`{byte_to_unicode_map[ord(\" \")]}`'],\n", 636 | " ['A nonbreakable space', '`\\\\xa0`', '160', f'`{byte_to_unicode_map[ord(chr(160))]}`'],\n", 637 | " ['A newline character', '`\\\\n`', '10', f'`{byte_to_unicode_map[ord(chr(10))]}`'],\n", 638 | "]\n", 639 | "\n", 640 | "pd.DataFrame(examples, columns = ['Description', 'Character', 'Bytes', 'Mapped bytes'])" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": null, 646 | "metadata": {}, 647 | "outputs": [ 648 | { 649 | "name": "stdout", 650 | "output_type": "stream", 651 | "text": [ 652 | "[('def', (0, 3)), ('Ġsay', (3, 7)), ('_', (7, 8)), ('hello', (8, 13)), ('():',\n", 653 | "(13, 16)), ('ĊĠĠĠ', (16, 20)), ('Ġprint', (20, 26)), ('(\"', (26, 28)), ('Hello',\n", 654 | "(28, 33)), (',', (33, 34)), ('ĠWorld', (34, 40)), ('!\")', (40, 43)), ('Ġ#', (43,\n", 655 | "45)), ('ĠPrint', (45, 51)), ('Ġit', (51, 54)), ('Ċ', (54, 55)), ('Ċ', (55, 56)),\n", 656 | "('say', (56, 59)), ('_', (59, 60)), ('hello', (60, 65)), ('()', (65, 67)), ('Ċ',\n", 657 | "(67, 68))]\n" 658 | ] 659 | } 660 | ], 661 | "source": [ 662 | "print(tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(python_code))" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": null, 668 | "metadata": {}, 669 | "outputs": [ 670 | { 671 | "name": "stdout", 672 | "output_type": "stream", 673 | "text": [ 674 | "Size of the vocabulary: 50257\n" 675 | ] 676 | } 677 | ], 678 | "source": [ 679 | "print(f\"Size of the vocabulary: {len(tokenizer)}\")" 680 | ] 681 | }, 682 | { 683 | "cell_type": "code", 684 | "execution_count": null, 685 | "metadata": {}, 686 | "outputs": [ 687 | { 688 | "name": "stdout", 689 | "output_type": "stream", 690 | "text": [ 691 | "['def', 'Ġsay', '_', 'hello', '():', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġprint', '(\"',\n", 692 | "'Hello', ',', 'ĠWorld', '!\"', ')', 'Ġ#', 'ĠPrint', 'Ġit', 'Ċ', 'Ċ', 'say', '_',\n", 693 | "'hello', '()', 'Ċ']\n" 694 | ] 695 | } 696 | ], 697 | "source": [ 698 | "print(tokenizer(python_code).tokens())" 699 | ] 700 | }, 701 | { 702 | "cell_type": "markdown", 703 | "metadata": {}, 704 | "source": [ 705 | "### Training a Tokenizer" 706 | ] 
707 | }, 708 | { 709 | "cell_type": "code", 710 | "execution_count": null, 711 | "metadata": {}, 712 | "outputs": [ 713 | { 714 | "name": "stdout", 715 | "output_type": "stream", 716 | "text": [ 717 | "['ÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂ', '\n", 718 | "=================================================================', '\n", 719 | "----------------------------------------------------------------',\n", 720 | "'................................................................',\n", 721 | "'ÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂ',\n", 722 | "'----------------------------------------------------------------',\n", 723 | "'================================================================',\n", 724 | "'________________________________________________________________']\n" 725 | ] 726 | } 727 | ], 728 | "source": [ 729 | "tokens = sorted(tokenizer.vocab.items(), key=lambda x: len(x[0]), reverse=True)\n", 730 | "print([f'{tokenizer.convert_tokens_to_string(t)}' for t, _ in tokens[:8]]);" 731 | ] 732 | }, 733 | { 734 | "cell_type": "code", 735 | "execution_count": null, 736 | "metadata": {}, 737 | "outputs": [ 738 | { 739 | "name": "stdout", 740 | "output_type": "stream", 741 | "text": [ 742 | "['<|endoftext|>', ' gazed', ' informants', ' Collider', ' regress', 'ominated',\n", 743 | "' amplification', 'Compar', '….\"', ' (/', 'Commission', ' Hitman']\n" 744 | ] 745 | } 746 | ], 747 | "source": [ 748 | "tokens = sorted(tokenizer.vocab.items(), key=lambda x: x[1], reverse=True)\n", 749 | "print([f'{tokenizer.convert_tokens_to_string(t)}' for t, _ in tokens[:12]]);" 750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": null, 755 | "metadata": {}, 756 | "outputs": [ 757 | { 758 | "data": { 759 | "application/vnd.jupyter.widget-view+json": { 760 | "model_id": "743bca69d71649908db9ca5760af61d2", 761 | "version_major": 2, 762 | "version_minor": 0 763 | }, 764 | "text/plain": [ 765 | "Check remote data files: 0%| | 0/183 [00:00" 1061 | ] 1062 | }, 1063 | { 1064 | "cell_type": "markdown", 1065 | "metadata": {}, 1066 | "source": [ 1067 | "#### Causal language modeling" 1068 | ] 1069 | }, 1070 | { 1071 | "cell_type": "markdown", 1072 | "metadata": {}, 1073 | "source": [ 1074 | "\"CLM" 1075 | ] 1076 | }, 1077 | { 1078 | "cell_type": "markdown", 1079 | "metadata": {}, 1080 | "source": [ 1081 | "#### Masked language modeling" 1082 | ] 1083 | }, 1084 | { 1085 | "cell_type": "markdown", 1086 | "metadata": {}, 1087 | "source": [ 1088 | "\"MLM" 1089 | ] 1090 | }, 1091 | { 1092 | "cell_type": "markdown", 1093 | "metadata": {}, 1094 | "source": [ 1095 | "#### Sequence-to-sequence training" 1096 | ] 1097 | }, 1098 | { 1099 | "cell_type": "markdown", 1100 | "metadata": {}, 1101 | "source": [ 1102 | "\"Seq2seq" 1103 | ] 1104 | }, 1105 | { 1106 | "cell_type": "markdown", 1107 | "metadata": {}, 1108 | "source": [ 1109 | "### Initializing the Model" 1110 | ] 1111 | }, 1112 | { 1113 | "cell_type": "markdown", 1114 | "metadata": {}, 1115 | "source": [ 1116 | "> **NOTE**: In the following code block, a large GPT-2 checkpoint is loaded into memory. On platforms like Colab and Kaggle, this can cause the instance to crash due to insufficient RAM or GPU memory. You can still run the example if you use the small checkpoint by replacing the configuration with `config = AutoConfig.from_pretrained(\"gpt2\", vocab_size=len(tokenizer))`." 
1117 | ] 1118 | }, 1119 | { 1120 | "cell_type": "code", 1121 | "execution_count": null, 1122 | "metadata": {}, 1123 | "outputs": [ 1124 | { 1125 | "data": { 1126 | "application/vnd.jupyter.widget-view+json": { 1127 | "model_id": "be84ca77ca144954af8ae4820ec6685b", 1128 | "version_major": 2, 1129 | "version_minor": 0 1130 | }, 1131 | "text/plain": [ 1132 | "Downloading: 0%| | 0.00/787 [00:00" 1235 | ] 1236 | }, 1237 | { 1238 | "cell_type": "code", 1239 | "execution_count": null, 1240 | "metadata": {}, 1241 | "outputs": [ 1242 | { 1243 | "name": "stderr", 1244 | "output_type": "stream", 1245 | "text": [ 1246 | " 0%| | 1/500 [00:00<01:16, 6.54it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2605 > 1024). Running this sequence through the model will result in indexing errors\n", 1247 | "100%|██████████| 500/500 [00:04<00:00, 122.59it/s]\n" 1248 | ] 1249 | } 1250 | ], 1251 | "source": [ 1252 | "#hide_output\n", 1253 | "examples, total_characters, total_tokens = 500, 0, 0\n", 1254 | "dataset = load_dataset('transformersbook/codeparrot-train', split='train',\n", 1255 | " streaming=True)\n", 1256 | "\n", 1257 | "for _, example in tqdm(zip(range(examples), iter(dataset)), total=examples):\n", 1258 | " total_characters += len(example['content'])\n", 1259 | " total_tokens += len(tokenizer(example['content']).tokens())\n", 1260 | "\n", 1261 | "characters_per_token = total_characters / total_tokens" 1262 | ] 1263 | }, 1264 | { 1265 | "cell_type": "code", 1266 | "execution_count": null, 1267 | "metadata": {}, 1268 | "outputs": [ 1269 | { 1270 | "name": "stdout", 1271 | "output_type": "stream", 1272 | "text": [ 1273 | "3.6233025034779565\n" 1274 | ] 1275 | } 1276 | ], 1277 | "source": [ 1278 | "print(characters_per_token)" 1279 | ] 1280 | }, 1281 | { 1282 | "cell_type": "code", 1283 | "execution_count": null, 1284 | "metadata": {}, 1285 | "outputs": [], 1286 | "source": [ 1287 | "import torch\n", 1288 | "from torch.utils.data import IterableDataset\n", 1289 | "\n", 1290 | "class ConstantLengthDataset(IterableDataset):\n", 1291 | " \n", 1292 | " def __init__(self, tokenizer, dataset, seq_length=1024,\n", 1293 | " num_of_sequences=1024, chars_per_token=3.6):\n", 1294 | " self.tokenizer = tokenizer\n", 1295 | " self.concat_token_id = tokenizer.eos_token_id\n", 1296 | " self.dataset = dataset\n", 1297 | " self.seq_length = seq_length\n", 1298 | " self.input_characters = seq_length * chars_per_token * num_of_sequences\n", 1299 | " \n", 1300 | " def __iter__(self):\n", 1301 | " iterator = iter(self.dataset)\n", 1302 | " more_examples = True\n", 1303 | " while more_examples:\n", 1304 | " buffer, buffer_len = [], 0\n", 1305 | " while True:\n", 1306 | " if buffer_len >= self.input_characters:\n", 1307 | " m=f\"Buffer full: {buffer_len}>={self.input_characters:.0f}\"\n", 1308 | " print(m)\n", 1309 | " break\n", 1310 | " try:\n", 1311 | " m=f\"Fill buffer: {buffer_len}<{self.input_characters:.0f}\"\n", 1312 | " print(m)\n", 1313 | " buffer.append(next(iterator)[\"content\"])\n", 1314 | " buffer_len += len(buffer[-1])\n", 1315 | " except StopIteration:\n", 1316 | " iterator = iter(self.dataset)\n", 1317 | "\n", 1318 | " all_token_ids = []\n", 1319 | " tokenized_inputs = self.tokenizer(buffer, truncation=False)\n", 1320 | " for tokenized_input in tokenized_inputs['input_ids']:\n", 1321 | " all_token_ids.extend(tokenized_input + [self.concat_token_id])\n", 1322 | " \n", 1323 | " for i in range(0, len(all_token_ids), self.seq_length):\n", 1324 | " input_ids = 
all_token_ids[i : i + self.seq_length]\n", 1325 | "            if len(input_ids) == self.seq_length:\n", 1326 | "                yield torch.tensor(input_ids)" 1327 | ] 1328 | }, 1329 | { 1330 | "cell_type": "code", 1331 | "execution_count": null, 1332 | "metadata": {}, 1333 | "outputs": [ 1334 | { 1335 | "name": "stdout", 1336 | "output_type": "stream", 1337 | "text": [ 1338 | "Fill buffer: 0<36864\n", 1339 | "Fill buffer: 3311<36864\n", 1340 | "Fill buffer: 9590<36864\n", 1341 | "Fill buffer: 22177<36864\n", 1342 | "Fill buffer: 25530<36864\n", 1343 | "Fill buffer: 31098<36864\n", 1344 | "Fill buffer: 32232<36864\n", 1345 | "Fill buffer: 33867<36864\n", 1346 | "Buffer full: 41172>=36864\n", 1347 | "Lengths of the sequences: [1024, 1024, 1024, 1024, 1024]\n" 1348 | ] 1349 | } 1350 | ], 1351 | "source": [ 1352 | "shuffled_dataset = dataset.shuffle(buffer_size=100)\n", 1353 | "constant_length_dataset = ConstantLengthDataset(tokenizer, shuffled_dataset,\n", 1354 | "                                                num_of_sequences=10)\n", 1355 | "dataset_iterator = iter(constant_length_dataset)\n", 1356 | "\n", 1357 | "lengths = [len(b) for _, b in zip(range(5), dataset_iterator)]\n", 1358 | "print(f\"Lengths of the sequences: {lengths}\")" 1359 | ] 1360 | }, 1361 | { 1362 | "cell_type": "markdown", 1363 | "metadata": {}, 1364 | "source": [ 1365 | "### Defining the Training Loop" 1366 | ] 1367 | }, 1368 | { 1369 | "cell_type": "code", 1370 | "execution_count": null, 1371 | "metadata": {}, 1372 | "outputs": [], 1373 | "source": [ 1374 | "from argparse import Namespace\n", 1375 | "\n", 1376 | "# Commented parameters correspond to the small model\n", 1377 | "config = {\"train_batch_size\": 2, # 12\n", 1378 | "          \"valid_batch_size\": 2, # 12\n", 1379 | "          \"weight_decay\": 0.1,\n", 1380 | "          \"shuffle_buffer\": 1000,\n", 1381 | "          \"learning_rate\": 2e-4, # 5e-4\n", 1382 | "          \"lr_scheduler_type\": \"cosine\",\n", 1383 | "          \"num_warmup_steps\": 750, # 2000\n", 1384 | "          \"gradient_accumulation_steps\": 16, # 1\n", 1385 | "          \"max_train_steps\": 50000, # 150000\n", 1386 | "          \"max_eval_steps\": -1,\n", 1387 | "          \"seq_length\": 1024,\n", 1388 | "          \"seed\": 1,\n", 1389 | "          \"save_checkpoint_steps\": 50000} # 15000\n", 1390 | "\n", 1391 | "args = Namespace(**config)" 1392 | ] 1393 | }, 1394 | { 1395 | "cell_type": "code", 1396 | "execution_count": null, 1397 | "metadata": {}, 1398 | "outputs": [], 1399 | "source": [ 1400 | "from torch.utils.tensorboard import SummaryWriter\n", 1401 | "import logging\n", 1402 | "import datasets, transformers, wandb\n", 1403 | "\n", 1404 | "def setup_logging(project_name):\n", 1405 | "    logger = logging.getLogger(__name__)\n", 1406 | "    logging.basicConfig(\n", 1407 | "        format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n", 1408 | "        datefmt=\"%m/%d/%Y %H:%M:%S\", level=logging.INFO, handlers=[\n", 1409 | "        logging.FileHandler(f\"log/debug_{accelerator.process_index}.log\"),\n", 1410 | "        logging.StreamHandler()])\n", 1411 | "    if accelerator.is_main_process: # We only want to set up logging once\n", 1412 | "        wandb.init(project=project_name, config=args)\n", 1413 | "        run_name = wandb.run.name\n", 1414 | "        tb_writer = SummaryWriter()\n", 1415 | "        tb_writer.add_hparams(vars(args), {'0': 0})\n", 1416 | "        logger.setLevel(logging.INFO)\n", 1417 | "        datasets.utils.logging.set_verbosity_debug()\n", 1418 | "        transformers.utils.logging.set_verbosity_info()\n", 1419 | "    else:\n", 1420 | "        tb_writer = None\n", 1421 | "        run_name = ''\n", 1422 | "        logger.setLevel(logging.ERROR)\n", 1423 | "        datasets.utils.logging.set_verbosity_error()\n", 1424 | "        transformers.utils.logging.set_verbosity_error()\n", 1425 | "    return logger, tb_writer, run_name" 1426 | ] 1427 | }, 1428 | { 1429 | "cell_type": "code", 1430 | "execution_count": null, 1431 | "metadata": {}, 1432 | "outputs": [], 1433 | "source": [ 1434 | "def log_metrics(step, metrics):\n", 1435 | "    logger.info(f\"Step {step}: {metrics}\")\n", 1436 | "    if accelerator.is_main_process:\n", 1437 | "        wandb.log(metrics)\n", 1438 | "        [tb_writer.add_scalar(k, v, step) for k, v in metrics.items()]" 1439 | ] 1440 | }, 1441 | { 1442 | "cell_type": "code", 1443 | "execution_count": null, 1444 | "metadata": {}, 1445 | "outputs": [ 1446 | { 1447 | "data": { 1448 | "application/vnd.jupyter.widget-view+json": { 1449 | "model_id": "328dc6d7d05c452e8d8e2cab5b4b9c4e", 1450 | "version_major": 2, 1451 | "version_minor": 0 1452 | }, 1453 | "text/plain": [ 1454 | "Check remote data files: 0%| | 0/183 [00:00 0 and step >= args.max_eval_steps: break\n", 1523 | "    loss = torch.mean(torch.cat(losses))\n", 1524 | "    try:\n", 1525 | "        perplexity = torch.exp(loss)\n", 1526 | "    except OverflowError:\n", 1527 | "        perplexity = torch.tensor(float(\"inf\"))\n", 1528 | "    return loss.item(), perplexity.item()" 1529 | ] 1530 | }, 1531 | { 1532 | "cell_type": "code", 1533 | "execution_count": null, 1534 | "metadata": {}, 1535 | "outputs": [], 1536 | "source": [ 1537 | "set_seed(args.seed)\n", 1538 | "\n", 1539 | "# Accelerator\n", 1540 | "accelerator = Accelerator()\n", 1541 | "samples_per_step = accelerator.state.num_processes * args.train_batch_size\n", 1542 | "\n", 1543 | "# Logging\n", 1544 | "logger, tb_writer, run_name = setup_logging(project_name.split(\"/\")[1])\n", 1545 | "logger.info(accelerator.state)\n", 1546 | "\n", 1547 | "# Load model and tokenizer\n", 1548 | "if accelerator.is_main_process:\n", 1549 | "    hf_repo = Repository(\"./\", clone_from=project_name, revision=run_name)\n", 1550 | "model = AutoModelForCausalLM.from_pretrained(\"./\", gradient_checkpointing=True)\n", 1551 | "tokenizer = AutoTokenizer.from_pretrained(\"./\")\n", 1552 | "\n", 1553 | "# Load dataset and dataloader\n", 1554 | "train_dataloader, eval_dataloader = create_dataloaders(dataset_name)\n", 1555 | "\n", 1556 | "# Prepare the optimizer and learning rate scheduler\n", 1557 | "optimizer = AdamW(get_grouped_params(model), lr=args.learning_rate)\n", 1558 | "lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer,\n", 1559 | "                             num_warmup_steps=args.num_warmup_steps,\n", 1560 | "                             num_training_steps=args.max_train_steps,)\n", 1561 | "def get_lr():\n", 1562 | "    return optimizer.param_groups[0]['lr']\n", 1563 | "\n", 1564 | "# Prepare everything with our `accelerator` (order of args is not important)\n", 1565 | "model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(\n", 1566 | "    model, optimizer, train_dataloader, eval_dataloader)\n", 1567 | "\n", 1568 | "# Train model\n", 1569 | "model.train()\n", 1570 | "completed_steps = 0\n", 1571 | "for step, batch in enumerate(train_dataloader, start=1):\n", 1572 | "    loss = model(batch, labels=batch).loss\n", 1573 | "    log_metrics(step, {'lr': get_lr(), 'samples': step*samples_per_step,\n", 1574 | "                       'steps': completed_steps, 'loss/train': loss.item()})\n", 1575 | "    loss = loss / args.gradient_accumulation_steps\n", 1576 | "    accelerator.backward(loss)\n", 1577 | "    if step % args.gradient_accumulation_steps == 0:\n", 1578 | "        optimizer.step()\n", 1579 | "        lr_scheduler.step()\n", 1580 | "        optimizer.zero_grad()\n", 1581 | "        completed_steps += 1\n", 1582 | "    if 
step % args.save_checkpoint_steps == 0:\n", 1583 | "        logger.info('Evaluating and saving model checkpoint')\n", 1584 | "        eval_loss, perplexity = evaluate()\n", 1585 | "        log_metrics(step, {'loss/eval': eval_loss, 'perplexity': perplexity})\n", 1586 | "        accelerator.wait_for_everyone()\n", 1587 | "        unwrapped_model = accelerator.unwrap_model(model)\n", 1588 | "        if accelerator.is_main_process:\n", 1589 | "            unwrapped_model.save_pretrained(\"./\")\n", 1590 | "            hf_repo.push_to_hub(commit_message=f'step {step}')\n", 1591 | "        model.train()\n", 1592 | "    if completed_steps >= args.max_train_steps:\n", 1593 | "        break\n", 1594 | "\n", 1595 | "# Evaluate and save the last checkpoint\n", 1596 | "logger.info('Evaluating and saving model after training')\n", 1597 | "eval_loss, perplexity = evaluate()\n", 1598 | "log_metrics(step, {'loss/eval': eval_loss, 'perplexity': perplexity})\n", 1599 | "accelerator.wait_for_everyone()\n", 1600 | "unwrapped_model = accelerator.unwrap_model(model)\n", 1601 | "if accelerator.is_main_process:\n", 1602 | "    unwrapped_model.save_pretrained(\"./\")\n", 1603 | "    hf_repo.push_to_hub(commit_message='final model')" 1604 | ] 1605 | }, 1606 | { 1607 | "cell_type": "markdown", 1608 | "metadata": {}, 1609 | "source": [ 1610 | "\"DDP\"" 1611 | ] 1612 | }, 1613 | { 1614 | "cell_type": "markdown", 1615 | "metadata": {}, 1616 | "source": [ 1617 | "### The Training Run" 1618 | ] 1619 | }, 1620 | { 1621 | "cell_type": "markdown", 1622 | "metadata": {}, 1623 | "source": [ 1624 | "## Results and Analysis" 1625 | ] 1626 | }, 1627 | { 1628 | "cell_type": "code", 1629 | "execution_count": null, 1630 | "metadata": {}, 1631 | "outputs": [ 1632 | { 1633 | "name": "stderr", 1634 | "output_type": "stream", 1635 | "text": [ 1636 | "2021-10-20 18:29:01.107727: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n", 1637 | "2021-10-20 18:29:01.107759: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n" 1638 | ] 1639 | } 1640 | ], 1641 | "source": [ 1642 | "#hide_output\n", 1643 | "from transformers import pipeline, set_seed\n", 1644 | "\n", 1645 | "model_ckpt = 'transformersbook/codeparrot-small'\n", 1646 | "generation = pipeline('text-generation', model=model_ckpt, device=0)" 1647 | ] 1648 | }, 1649 | { 1650 | "cell_type": "code", 1651 | "execution_count": null, 1652 | "metadata": {}, 1653 | "outputs": [], 1654 | "source": [ 1655 | "import re\n", 1656 | "from transformers import set_seed \n", 1657 | "\n", 1658 | "def first_block(string):\n", 1659 | "    return re.split('\\\\nclass|\\\\ndef|\\\\n#|\\\\n@|\\\\nprint|\\\\nif', string)[0].rstrip()\n", 1660 | "\n", 1661 | "def complete_code(pipe, prompt, max_length=64, num_completions=4, seed=1):\n", 1662 | "    set_seed(seed)\n", 1663 | "    gen_kwargs = {\"temperature\":0.4, \"top_p\":0.95, \"top_k\":0, \"num_beams\":1,\n", 1664 | "                  \"do_sample\":True,}\n", 1665 | "    code_gens = pipe(prompt, num_return_sequences=num_completions, \n", 1666 | "                     max_length=max_length, **gen_kwargs)\n", 1667 | "    code_strings = []\n", 1668 | "    for code_gen in code_gens:\n", 1669 | "        generated_code = first_block(code_gen['generated_text'][len(prompt):])\n", 1670 | "        code_strings.append(generated_code)\n", 1671 | "    print(('\\\\n'+'='*80 + '\\\\n').join(code_strings))" 1672 | ] 1673 | }, 1674 | { 1675 | "cell_type": "code", 1676 | "execution_count": null, 1677 | 
"metadata": {}, 1678 | "outputs": [ 1679 | { 1680 | "name": "stdout", 1681 | "output_type": "stream", 1682 | "text": [ 1683 | "\n", 1684 | " return math.sqrt(a * b)\n", 1685 | "================================================================================\n", 1686 | "\n", 1687 | " return a * b / 2.0\n", 1688 | "================================================================================\n", 1689 | "\n", 1690 | " return a * b\n", 1691 | "================================================================================\n", 1692 | "\n", 1693 | " return a * b / a\n" 1694 | ] 1695 | } 1696 | ], 1697 | "source": [ 1698 | "prompt = '''def area_of_rectangle(a: float, b: float):\n", 1699 | " \"\"\"Return the area of the rectangle.\"\"\"'''\n", 1700 | "complete_code(generation, prompt)" 1701 | ] 1702 | }, 1703 | { 1704 | "cell_type": "code", 1705 | "execution_count": null, 1706 | "metadata": {}, 1707 | "outputs": [ 1708 | { 1709 | "name": "stdout", 1710 | "output_type": "stream", 1711 | "text": [ 1712 | "\n", 1713 | " if not html:\n", 1714 | " return []\n", 1715 | " return [url for url in re.findall(r'', html)]\n", 1716 | "================================================================================\n", 1717 | "\n", 1718 | " return [url for url in re.findall(r']*>', html)\n" 1726 | ] 1727 | } 1728 | ], 1729 | "source": [ 1730 | "prompt = '''def get_urls_from_html(html):\n", 1731 | " \"\"\"Get all embedded URLs in a HTML string.\"\"\"'''\n", 1732 | "complete_code(generation, prompt)" 1733 | ] 1734 | }, 1735 | { 1736 | "cell_type": "code", 1737 | "execution_count": null, 1738 | "metadata": {}, 1739 | "outputs": [ 1740 | { 1741 | "name": "stdout", 1742 | "output_type": "stream", 1743 | "text": [ 1744 | "https://github.com/huggingface/transformers | /allenai | /facebook |\n", 1745 | "/asteroid-team | /google | /amazon | /speechbrain | /microsoft | /grammarly |\n", 1746 | "/models | /inference-api | /distilbert-base-uncased |\n", 1747 | "/dbmdz/bert-large-cased-finetuned-conll03-english |\n", 1748 | "https://huggingface.co/transformers | https://arxiv.org/abs/1811.06031 |\n", 1749 | "https://arxiv.org/abs/1803.10631 | https://transformer.huggingface.co/ | /coref\n", 1750 | "| https://medium.com/huggingface/distilbert-8cf3380435b5\n" 1751 | ] 1752 | } 1753 | ], 1754 | "source": [ 1755 | "import requests\n", 1756 | "\n", 1757 | "def get_urls_from_html(html):\n", 1758 | " return [url for url in re.findall(r' **NOTE**: In the following code block, a large GPT-2 checkpoint is loaded into memory. On platforms like Colab and Kaggle, this can cause the instance to crash due to insufficient RAM or GPU memory. 
You can still run the example if you replace the large model with the small one by using `model_ckpt = \"transformersbook/codeparrot-small\"`.\n", 1768 | " " 1769 | ] 1770 | }, 1771 | { 1772 | "cell_type": "code", 1773 | "execution_count": null, 1774 | "metadata": {}, 1775 | "outputs": [ 1776 | { 1777 | "name": "stderr", 1778 | "output_type": "stream", 1779 | "text": [ 1780 | "Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.\n" 1781 | ] 1782 | }, 1783 | { 1784 | "name": "stdout", 1785 | "output_type": "stream", 1786 | "text": [ 1787 | "\n", 1788 | " return np.mean(a)\n", 1789 | "================================================================================\n", 1790 | "\n", 1791 | " return np.mean(a)\n", 1792 | "================================================================================\n", 1793 | "\n", 1794 | " return np.mean(a)\n", 1795 | "================================================================================\n", 1796 | "\n", 1797 | " return np.mean(a)\n" 1798 | ] 1799 | } 1800 | ], 1801 | "source": [ 1802 | "model_ckpt = 'transformersbook/codeparrot'\n", 1803 | "generation = pipeline('text-generation', model=model_ckpt, device=0)\n", 1804 | "\n", 1805 | "prompt = '''# a function in native python:\n", 1806 | "def mean(a):\n", 1807 | " return sum(a)/len(a)\n", 1808 | "\n", 1809 | "# the same function using numpy:\n", 1810 | "import numpy as np\n", 1811 | "def mean(a):'''\n", 1812 | "complete_code(generation, prompt, max_length=64)" 1813 | ] 1814 | }, 1815 | { 1816 | "cell_type": "code", 1817 | "execution_count": null, 1818 | "metadata": {}, 1819 | "outputs": [ 1820 | { 1821 | "name": "stderr", 1822 | "output_type": "stream", 1823 | "text": [ 1824 | "Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.\n" 1825 | ] 1826 | }, 1827 | { 1828 | "name": "stdout", 1829 | "output_type": "stream", 1830 | "text": [ 1831 | "\n", 1832 | "reg = DummyRegressor()\n", 1833 | "\n", 1834 | "forest = RandomForestClassifier(n_estimators=20)\n", 1835 | "\n", 1836 | "forest.fit(X, y)\n", 1837 | "================================================================================\n", 1838 | "\n", 1839 | "clf = ExtraTreesClassifier(n_estimators=100, max_features='sqrt')\n", 1840 | "clf.fit(X, y)\n", 1841 | "================================================================================\n", 1842 | "\n", 1843 | "clf = RandomForestClassifier(n_estimators=20, n_jobs=n_jobs, random_state=1)\n", 1844 | "clf.fit(X, y)\n", 1845 | "================================================================================\n", 1846 | "\n", 1847 | "clf = RandomForestClassifier(n_estimators=20)\n", 1848 | "clf.fit(X, y)\n" 1849 | ] 1850 | } 1851 | ], 1852 | "source": [ 1853 | "prompt = '''X = np.random.randn(100, 100)\n", 1854 | "y = np.random.randint(0, 1, 100)\n", 1855 | "\n", 1856 | "# fit random forest classifier with 20 estimators'''\n", 1857 | "complete_code(generation, prompt, max_length=96)" 1858 | ] 1859 | }, 1860 | { 1861 | "cell_type": "markdown", 1862 | "metadata": {}, 1863 | "source": [ 1864 | "## Conclusion" 1865 | ] 1866 | }, 1867 | { 1868 | "cell_type": "code", 1869 | "execution_count": null, 1870 | "metadata": {}, 1871 | "outputs": [], 1872 | "source": [] 1873 | } 1874 | ], 1875 | "metadata": { 1876 | "kernelspec": { 1877 | "display_name": "Python 3 (ipykernel)", 1878 | "language": "python", 1879 | "name": "python3" 1880 | } 1881 | }, 1882 | "nbformat": 4, 1883 | "nbformat_minor": 4 1884 | } 1885 | 
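Two quick sanity checks on the notebook above. First, the buffer sizing inside `ConstantLengthDataset`: a minimal, self-contained sketch of that arithmetic, using only constants that appear in the notebook's own demo run (`num_of_sequences=10`; the class defaults to `num_of_sequences=1024` and `chars_per_token=3.6`, approximating the measured ~3.62):

```python
# Minimal sketch of the ConstantLengthDataset buffer arithmetic.
seq_length = 1024      # tokens per packed training sequence
chars_per_token = 3.6  # estimated characters per token for Python code
num_of_sequences = 10  # value passed explicitly in the demo run

# The dataset buffers raw text until it holds roughly enough characters
# to tokenize into num_of_sequences sequences of seq_length tokens each.
input_characters = seq_length * chars_per_token * num_of_sequences
print(int(input_characters))  # 36864, matching the "Fill buffer: ...<36864" logs
```

Second, `evaluate()` reports perplexity as the exponential of the mean cross-entropy loss. A tiny illustration of that relation (the loss value below is made up for demonstration, not taken from the training run):

```python
import torch

# Perplexity is the exponentiated cross-entropy loss, as in evaluate() above.
loss = torch.tensor(2.0)      # illustrative mean evaluation loss
perplexity = torch.exp(loss)  # exp(2.0) ~= 7.39
print(f"{perplexity.item():.2f}")  # the model is as uncertain as a uniform
                                   # choice over ~7.4 tokens at each step
```

Note that `ConstantLengthDataset` simply drops leftover token runs shorter than `seq_length` at the end of each buffer, so the packed batches never need padding.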
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Transformers Notebooks 2 | 3 | This repository contains the example code from our O'Reilly book [Natural Language Processing with Transformers](https://www.oreilly.com/library/view/natural-language-processing/9781098136789/): 4 | 5 | book-cover 6 | 7 | ## Getting started 8 | 9 | You can run these notebooks on cloud platforms like [Google Colab](https://colab.research.google.com/) or your local machine. Note that most chapters require a GPU to run in a reasonable amount of time, so we recommend one of the cloud platforms as they come pre-installed with CUDA. 
10 | 11 | ### Running on a cloud platform 12 | 13 | To run these notebooks on a cloud platform, just click on one of the badges in the table below: 14 | 15 | 16 | 17 | 18 | 19 | | Chapter | Colab | Kaggle | Gradient | Studio Lab | 20 | |:--------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 21 | | Introduction | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nlp-with-transformers/notebooks/blob/main/01_introduction.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/nlp-with-transformers/notebooks/blob/main/01_introduction.ipynb) | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/nlp-with-transformers/notebooks/blob/main/01_introduction.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/nlp-with-transformers/notebooks/blob/main/01_introduction.ipynb) | 22 | | Text Classification | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nlp-with-transformers/notebooks/blob/main/02_classification.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/nlp-with-transformers/notebooks/blob/main/02_classification.ipynb) | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/nlp-with-transformers/notebooks/blob/main/02_classification.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/nlp-with-transformers/notebooks/blob/main/02_classification.ipynb) | 23 | | Transformer Anatomy | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nlp-with-transformers/notebooks/blob/main/03_transformer-anatomy.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/nlp-with-transformers/notebooks/blob/main/03_transformer-anatomy.ipynb) | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/nlp-with-transformers/notebooks/blob/main/03_transformer-anatomy.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/nlp-with-transformers/notebooks/blob/main/03_transformer-anatomy.ipynb) | 24 | | Multilingual Named Entity Recognition | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nlp-with-transformers/notebooks/blob/main/04_multilingual-ner.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/nlp-with-transformers/notebooks/blob/main/04_multilingual-ner.ipynb) | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/nlp-with-transformers/notebooks/blob/main/04_multilingual-ner.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/nlp-with-transformers/notebooks/blob/main/04_multilingual-ner.ipynb) | 25 | | Text Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nlp-with-transformers/notebooks/blob/main/05_text-generation.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/nlp-with-transformers/notebooks/blob/main/05_text-generation.ipynb) | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/nlp-with-transformers/notebooks/blob/main/05_text-generation.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/nlp-with-transformers/notebooks/blob/main/05_text-generation.ipynb) | 26 | | Summarization | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nlp-with-transformers/notebooks/blob/main/06_summarization.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/nlp-with-transformers/notebooks/blob/main/06_summarization.ipynb) | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/nlp-with-transformers/notebooks/blob/main/06_summarization.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/nlp-with-transformers/notebooks/blob/main/06_summarization.ipynb) | 27 | | Question Answering | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nlp-with-transformers/notebooks/blob/main/07_question-answering.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/nlp-with-transformers/notebooks/blob/main/07_question-answering.ipynb) | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/nlp-with-transformers/notebooks/blob/main/07_question-answering.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/nlp-with-transformers/notebooks/blob/main/07_question-answering.ipynb) | 28 | | Making Transformers Efficient in Production | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nlp-with-transformers/notebooks/blob/main/08_model-compression.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/nlp-with-transformers/notebooks/blob/main/08_model-compression.ipynb) | 
[![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/nlp-with-transformers/notebooks/blob/main/08_model-compression.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/nlp-with-transformers/notebooks/blob/main/08_model-compression.ipynb) | 29 | | Dealing with Few to No Labels | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nlp-with-transformers/notebooks/blob/main/09_few-to-no-labels.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/nlp-with-transformers/notebooks/blob/main/09_few-to-no-labels.ipynb) | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/nlp-with-transformers/notebooks/blob/main/09_few-to-no-labels.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/nlp-with-transformers/notebooks/blob/main/09_few-to-no-labels.ipynb) | 30 | | Training Transformers from Scratch | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nlp-with-transformers/notebooks/blob/main/10_transformers-from-scratch.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/nlp-with-transformers/notebooks/blob/main/10_transformers-from-scratch.ipynb) | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/nlp-with-transformers/notebooks/blob/main/10_transformers-from-scratch.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/nlp-with-transformers/notebooks/blob/main/10_transformers-from-scratch.ipynb) | 31 | | Future Directions | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nlp-with-transformers/notebooks/blob/main/11_future-directions.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/nlp-with-transformers/notebooks/blob/main/11_future-directions.ipynb) | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/nlp-with-transformers/notebooks/blob/main/11_future-directions.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/nlp-with-transformers/notebooks/blob/main/11_future-directions.ipynb) | 32 | 33 | 34 | 35 | Nowadays, the GPUs on Colab tend to be K80s (which have limited memory), so we recommend using [Kaggle](https://www.kaggle.com/docs/notebooks), [Gradient](https://gradient.run/notebooks), or [SageMaker Studio Lab](https://studiolab.sagemaker.aws/). These platforms tend to provide more performant GPUs like P100s, all for free! 36 | 37 | > Note: some cloud platforms like Kaggle require you to restart the notebook after installing new packages. 
38 |
39 | ### Running on your machine
40 |
41 | To run the notebooks on your own machine, first clone the repository and navigate to it:
42 |
43 | ```bash
44 | $ git clone https://github.com/nlp-with-transformers/notebooks.git
45 | $ cd notebooks
46 | ```
47 |
48 | Next, run the following command to create a `conda` virtual environment that contains all the libraries needed to run the notebooks:
49 |
50 | ```bash
51 | $ conda env create -f environment.yml
52 | ```
53 |
54 | > Note: You'll need a GPU that supports NVIDIA's [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) to build the environment. Currently, this means you cannot build locally on Apple silicon 😢.
55 |
56 | Chapter 7 (Question Answering) has a special set of dependencies, so to run that chapter you'll need a separate environment:
57 |
58 | ```bash
59 | $ conda env create -f environment-chapter7.yml
60 | ```
61 |
62 | Once you've installed the dependencies, you can activate the `conda` environment and spin up the notebooks as follows:
63 |
64 | ```bash
65 | $ conda activate book # or conda activate book-chapter7
66 | $ jupyter notebook
67 | ```
68 |
69 | ## FAQ
70 |
71 | ### When trying to clone the notebooks on Kaggle I get a message that I am unable to access the book's GitHub repository. How can I solve this issue?
72 |
73 | This issue is likely due to a missing internet connection. When running your first notebook on Kaggle you need to enable internet access in the settings menu on the right side.
74 |
75 | ### How do you select a GPU on Kaggle?
76 |
77 | You can enable GPU usage by selecting *GPU* as *Accelerator* in the settings menu on the right side.
78 |
79 | ## Citations
80 |
81 | If you'd like to cite this book, you can use the following BibTeX entry:
82 |
83 | ```
84 | @book{tunstall2022natural,
85 |   title={Natural Language Processing with Transformers: Building Language Applications with Hugging Face},
86 |   author={Tunstall, Lewis and von Werra, Leandro and Wolf, Thomas},
87 |   isbn={1098103246},
88 |   url={https://books.google.ch/books?id=7hhyzgEACAAJ},
89 |   year={2022},
90 |   publisher={O'Reilly Media, Incorporated}
91 | }
92 | ```
93 |
-------------------------------------------------------------------------------- /SageMaker/01_introduction.ipynb: --------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "%%capture\n",
10 | "%pip install datasets transformers[tf,torch,sentencepiece,vision,optuna,sklearn,onnxruntime]==4.11.3"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "#hide\n",
20 | "from utils import *\n",
21 | "setup_chapter()"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "In this notebook we demonstrate how you can run the examples from the book on Amazon SageMaker. \n",
29 | "\n",
30 | "The SageMaker notebook uses an AWS IAM role to access AWS resources such as an Amazon S3 bucket.\n",
31 | "You created this role during the notebook creation process described in SageMaker/README.md.\n",
32 | "In the AWS IAM service you can review the access policy and modify it if needed.\n",
33 | "\n",
34 | "In the next cell we check whether an Amazon S3 bucket exists and create a new one if not. In addition, we'll get the SageMaker role and session."
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "import sagemaker.huggingface\n",
44 | "import sagemaker\n",
45 | "\n",
46 | "sess = sagemaker.Session()\n",
47 | "# sagemaker session bucket -> used for uploading data, models and logs\n",
48 | "# sagemaker will automatically create this bucket if it does not exist\n",
49 | "sagemaker_session_bucket=None\n",
50 | "if sagemaker_session_bucket is None and sess is not None:\n",
51 | "    # set to default bucket if a bucket name is not given\n",
52 | "    sagemaker_session_bucket = sess.default_bucket()\n",
53 | "\n",
54 | "role = sagemaker.get_execution_role()\n",
55 | "sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)\n",
56 | "\n",
57 | "print(f\"sagemaker role arn: {role}\")\n",
58 | "print(f\"sagemaker bucket: {sess.default_bucket()}\")\n",
59 | "print(f\"sagemaker session region: {sess.boto_region_name}\")"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "Now we set up a helper function to easily deploy any Hugging Face model as an endpoint on Amazon SageMaker.\n",
67 | "The following function creates a HuggingFaceModel class that downloads the model from the Hugging Face Hub; the class can also load a trained model stored in an Amazon S3 bucket. \n",
68 | "Next, an endpoint is created to host your model. Based on the model requirements you can choose a specific instance type; instance types differ in the amount of memory, CPU, and GPU they provide. There are different inference options available, such as real-time, asynchronous, or serverless inference.\n",
69 | "If you are not sure which inference option works best for your model, you can use Amazon SageMaker Inference Recommender:\n",
70 | "https://docs.aws.amazon.com/sagemaker/latest/dg/deploy-model.html \n",
71 | "\n",
72 | "To view all options see the documentation: https://sagemaker.readthedocs.io/en/stable/frameworks/huggingface/index.html \n",
73 | "\n",
74 | "Depending on the Transformers version, PyTorch/TensorFlow version, and Python version, the mapping for the Hugging Face Model Class can be found here: https://huggingface.co/docs/sagemaker/reference#inference-dlc-overview \n",
75 | "\n",
76 | "To find the endpoints in the AWS Console, navigate to https://console.aws.amazon.com/sagemaker/home#/endpoints \n",
77 | "\n",
78 | "Make sure to run this notebook to the end so that the endpoint is deleted. "
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "from sagemaker.huggingface.model import HuggingFaceModel\n",
88 | "\n",
89 | "def setup_endpoint(model_name, task_name):\n",
90 | "    # Hub Model configuration.
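The model is downloaded from the Hub when the endpoint is created.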
\n", 91 | " hub = {\n", 92 | " 'HF_MODEL_ID': model_name, # model_id from hf.co/models\n", 93 | " 'HF_TASK': task_name # NLP task you want to use for predictions\n", 94 | " }\n", 95 | "\n", 96 | " # create Hugging Face Model Class\n", 97 | " huggingface_model = HuggingFaceModel(\n", 98 | " env=hub, # configuration for loading model from Hub\n", 99 | " role=role, # iam role with permissions to create an Endpoint\n", 100 | " transformers_version=\"4.17.0\", # transformers version used\n", 101 | " pytorch_version=\"1.10.2\", # pytorch version used\n", 102 | " py_version=\"py38\" # python version used\n", 103 | " )\n", 104 | "\n", 105 | " # deploy model to SageMaker Inference\n", 106 | " predictor = huggingface_model.deploy(\n", 107 | " initial_instance_count=1, # how many instances used\n", 108 | " instance_type=\"ml.m5.xlarge\" # instance type\n", 109 | " )\n", 110 | " return predictor" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "# Hello Transformers" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "## The Encoder-Decoder Framework" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "## Attention Mechanisms" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "## Transfer Learning in NLP" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "## Hugging Face Transformers: Bridging the Gap" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "## A Tour of Transformer Applications" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "text = \"\"\"Dear Amazon, last week I ordered an Optimus Prime action figure \\\n", 162 | "from your online store in Germany. Unfortunately, when I opened the package, \\\n", 163 | "I discovered to my horror that I had been sent an action figure of Megatron \\\n", 164 | "instead! As a lifelong enemy of the Decepticons, I hope you can understand my \\\n", 165 | "dilemma. To resolve the issue, I demand an exchange of Megatron for the \\\n", 166 | "Optimus Prime figure I ordered. Enclosed are copies of my records concerning \\\n", 167 | "this purchase. I expect to hear from you soon. 
Sincerely, Bumblebee.\"\"\"" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "### Text Classification" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "predictor = setup_endpoint('distilbert-base-uncased-finetuned-sst-2-english', 'text-classification')" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "# example request, you always need to define \"inputs\"\n", 193 | "import pandas as pd\n", 194 | "\n", 195 | "# request\n", 196 | "outputs = predictor.predict({\"inputs\": text})\n", 197 | "pd.DataFrame(outputs) " 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "predictor.delete_endpoint()" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "### Named Entity Recognition" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "predictor = setup_endpoint(\"dbmdz/bert-large-cased-finetuned-conll03-english\", \"ner\")" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "outputs = predictor.predict({\"inputs\": text, \"parameters\": {\"aggregation_strategy\": \"simple\"}})\n", 232 | "pd.DataFrame(outputs) " 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "predictor.delete_endpoint()" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "### Question Answering " 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "predictor = setup_endpoint(\"distilbert-base-cased-distilled-squad\", 'question-answering')" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "question = \"What does the customer want?\"\n", 267 | "\n", 268 | "outputs = predictor.predict({\"inputs\": {\n", 269 | " \"question\": question,\n", 270 | " \"context\": text\n", 271 | " }\n", 272 | "})\n", 273 | "\n", 274 | "pd.DataFrame([outputs]) " 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "predictor.delete_endpoint()" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "### Summarization" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "predictor = setup_endpoint(\"sshleifer/distilbart-cnn-12-6\", 'summarization')" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "outputs = predictor.predict({\"inputs\": text,\n", 309 | " \"parameters\": {\n", 310 | " \"max_length\":45,\n", 311 | " \"clean_up_tokenization_spaces\":True\n", 312 | " }\n", 313 | " })\n", 314 | "print(outputs[0]['summary_text'])" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | 
"execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "predictor.delete_endpoint()" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "### Translation" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "predictor = setup_endpoint(\"Helsinki-NLP/opus-mt-en-de\", \"translation\")" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "outputs = predictor.predict({\"inputs\": text,\n", 349 | " \"parameters\": {\n", 350 | " \"min_length\":100,\n", 351 | " \"clean_up_tokenization_spaces\":True\n", 352 | " }\n", 353 | " })\n", 354 | "print(outputs[0]['translation_text'])" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "predictor.delete_endpoint()" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "### Text Generation" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "predictor = setup_endpoint(\"gpt2\", 'text-generation')" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "response = \"Dear Bumblebee, I am sorry to hear that your order was mixed up.\"\n", 389 | "prompt = text + \"\\n\\nCustomer service response:\\n\" + response\n", 390 | "\n", 391 | "outputs = predictor.predict({\"inputs\": prompt,\n", 392 | " \"parameters\": {\n", 393 | " \"max_length\":200\n", 394 | " }\n", 395 | " })\n", 396 | "print(outputs[0]['generated_text'])" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "predictor.delete_endpoint()" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": {}, 411 | "source": [ 412 | "## The Hugging Face Ecosystem" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "### The Hugging Face Hub" 420 | ] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": {}, 425 | "source": [ 426 | "### Hugging Face Tokenizers" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": {}, 432 | "source": [ 433 | "### Hugging Face Datasets" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "### Hugging Face Accelerate" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "## Main Challenges with Transformers" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": {}, 453 | "source": [ 454 | "## Conclusion" 455 | ] 456 | } 457 | ], 458 | "metadata": { 459 | "kernelspec": { 460 | "display_name": "Python 3.9.13 64-bit", 461 | "language": "python", 462 | "name": "python3" 463 | }, 464 | "language_info": { 465 | "codemirror_mode": { 466 | "name": "ipython", 467 | "version": 3 468 | }, 469 | "file_extension": ".py", 470 | "mimetype": "text/x-python", 471 | "name": "python", 472 | "nbconvert_exporter": "python", 473 | "pygments_lexer": "ipython3", 474 | "version": "3.9.13" 475 | }, 476 | "vscode": { 477 | "interpreter": { 478 | "hash": 
"aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" 479 | } 480 | } 481 | }, 482 | "nbformat": 4, 483 | "nbformat_minor": 4 484 | } 485 | -------------------------------------------------------------------------------- /SageMaker/02_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%%capture\n", 12 | "%pip install datasets[audio]==1.16.1 umap-learn==0.5.1 datasets[s3] transformers[tf,torch,sentencepiece,vision,optuna,sklearn,onnxruntime]==4.11.3" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "#hide\n", 22 | "from utils import *\n", 23 | "setup_chapter()" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "# Text Classification" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## The Dataset" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### A First Look at Hugging Face Datasets" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "from datasets import list_datasets\n", 54 | "\n", 55 | "all_datasets = list_datasets()\n", 56 | "print(f\"There are {len(all_datasets)} datasets currently available on the Hub\")\n", 57 | "print(f\"The first 10 are: {all_datasets[:10]}\")" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# hide_output\n", 67 | "from datasets import load_dataset\n", 68 | "\n", 69 | "emotions = load_dataset(\"emotion\")" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "emotions" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "train_ds = emotions[\"train\"]\n", 88 | "train_ds" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "len(train_ds)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "train_ds[0]" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "train_ds.column_names" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "print(train_ds.features)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "print(train_ds[:5])" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "print(train_ds[\"text\"][:5])" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "### From Datasets to DataFrames" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 
158 | "import pandas as pd\n", 159 | "\n", 160 | "emotions.set_format(type=\"pandas\")\n", 161 | "df = emotions[\"train\"][:]\n", 162 | "df.head()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "def label_int2str(row):\n", 172 | " return emotions[\"train\"].features[\"label\"].int2str(row)\n", 173 | "\n", 174 | "df[\"label_name\"] = df[\"label\"].apply(label_int2str)\n", 175 | "df.head()" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "### Looking at the Class Distribution" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "import matplotlib.pyplot as plt\n", 192 | "\n", 193 | "df[\"label_name\"].value_counts(ascending=True).plot.barh()\n", 194 | "plt.title(\"Frequency of Classes\")\n", 195 | "plt.show()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "### How Long Are Our Tweets?" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "df[\"Words Per Tweet\"] = df[\"text\"].str.split().apply(len)\n", 212 | "df.boxplot(\"Words Per Tweet\", by=\"label_name\", grid=False, showfliers=False,\n", 213 | " color=\"black\")\n", 214 | "plt.suptitle(\"\")\n", 215 | "plt.xlabel(\"\")\n", 216 | "plt.show()" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "emotions.reset_format()" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "## From Text to Tokens" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### Character Tokenization" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "text = \"Tokenizing text is a core task of NLP.\"\n", 249 | "tokenized_text = list(text)\n", 250 | "print(tokenized_text)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "token2idx = {ch: idx for idx, ch in enumerate(sorted(set(tokenized_text)))}\n", 260 | "print(token2idx)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "input_ids = [token2idx[token] for token in tokenized_text]\n", 270 | "print(input_ids)" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "categorical_df = pd.DataFrame(\n", 280 | " {\"Name\": [\"Bumblebee\", \"Optimus Prime\", \"Megatron\"], \"Label ID\": [0,1,2]})\n", 281 | "categorical_df" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "pd.get_dummies(categorical_df[\"Name\"])" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "import torch\n", 300 | "import torch.nn.functional as F\n", 301 | "\n", 302 | "input_ids = torch.tensor(input_ids)\n", 303 | "one_hot_encodings = F.one_hot(input_ids, 
num_classes=len(token2idx))\n", 304 | "one_hot_encodings.shape" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "print(f\"Token: {tokenized_text[0]}\")\n", 314 | "print(f\"Tensor index: {input_ids[0]}\")\n", 315 | "print(f\"One-hot: {one_hot_encodings[0]}\")" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "### Word Tokenization" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "tokenized_text = text.split()\n", 332 | "print(tokenized_text)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "### Subword Tokenization" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "# hide_output\n", 349 | "from transformers import AutoTokenizer\n", 350 | "\n", 351 | "model_ckpt = \"distilbert-base-uncased\"\n", 352 | "tokenizer = AutoTokenizer.from_pretrained(model_ckpt)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "encoded_text = tokenizer(text)\n", 362 | "print(encoded_text)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)\n", 372 | "print(tokens)" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [ 381 | "print(tokenizer.convert_tokens_to_string(tokens))" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "tokenizer.vocab_size" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": null, 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "tokenizer.model_max_length" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "tokenizer.model_input_names" 409 | ] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "metadata": {}, 414 | "source": [ 415 | "### Tokenizing the Whole Dataset" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "def tokenize(batch):\n", 425 | " return tokenizer(batch[\"text\"], padding=True, truncation=True)" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [ 434 | "print(tokenize(emotions[\"train\"][:2]))" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "metadata": {}, 441 | "outputs": [], 442 | "source": [ 443 | "#hide_input\n", 444 | "tokens2ids = list(zip(tokenizer.all_special_tokens, tokenizer.all_special_ids))\n", 445 | "data = sorted(tokens2ids, key=lambda x : x[-1])\n", 446 | "df = pd.DataFrame(data, columns=[\"Special Token\", \"Special Token ID\"])\n", 447 | "df.T" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "# hide_output\n", 457 | 
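"# batched=True with batch_size=None tokenizes each split as a single batch\n",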
"emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "print(emotions_encoded[\"train\"].column_names)" 467 | ] 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "metadata": {}, 472 | "source": [ 473 | "## Training a Text Classifier" 474 | ] 475 | }, 476 | { 477 | "cell_type": "markdown", 478 | "metadata": {}, 479 | "source": [ 480 | "### Transformers as Feature Extractors" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "#### Using pretrained models" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "metadata": {}, 494 | "outputs": [], 495 | "source": [ 496 | "# hide_output\n", 497 | "from transformers import AutoModel\n", 498 | "\n", 499 | "model_ckpt = \"distilbert-base-uncased\"\n", 500 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 501 | "model = AutoModel.from_pretrained(model_ckpt).to(device)" 502 | ] 503 | }, 504 | { 505 | "cell_type": "markdown", 506 | "metadata": {}, 507 | "source": [ 508 | "#### Extracting the last hidden states" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "text = \"this is a test\"\n", 518 | "inputs = tokenizer(text, return_tensors=\"pt\")\n", 519 | "print(f\"Input tensor shape: {inputs['input_ids'].size()}\")" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | "inputs = {k:v.to(device) for k,v in inputs.items()}\n", 529 | "with torch.no_grad():\n", 530 | " outputs = model(**inputs)\n", 531 | "print(outputs)" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": null, 537 | "metadata": {}, 538 | "outputs": [], 539 | "source": [ 540 | "outputs.last_hidden_state.size()" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [ 549 | "outputs.last_hidden_state[:,0].size()" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": {}, 556 | "outputs": [], 557 | "source": [ 558 | "def extract_hidden_states(batch):\n", 559 | " # Place model inputs on the GPU\n", 560 | " inputs = {k:v.to(device) for k,v in batch.items() \n", 561 | " if k in tokenizer.model_input_names}\n", 562 | " # Extract last hidden states\n", 563 | " with torch.no_grad():\n", 564 | " last_hidden_state = model(**inputs).last_hidden_state\n", 565 | " # Return vector for [CLS] token\n", 566 | " return {\"hidden_state\": last_hidden_state[:,0].cpu().numpy()}" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": null, 572 | "metadata": {}, 573 | "outputs": [], 574 | "source": [ 575 | "emotions_encoded.set_format(\"torch\", \n", 576 | " columns=[\"input_ids\", \"attention_mask\", \"label\"])" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [ 585 | "#hide_output\n", 586 | "emotions_hidden = emotions_encoded.map(extract_hidden_states, batched=True)" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": {}, 593 | "outputs": [], 594 | "source": [ 595 | "emotions_hidden[\"train\"].column_names" 596 | 
] 597 | }, 598 | { 599 | "cell_type": "markdown", 600 | "metadata": {}, 601 | "source": [ 602 | "#### Creating a feature matrix" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "metadata": {}, 609 | "outputs": [], 610 | "source": [ 611 | "import numpy as np\n", 612 | "\n", 613 | "X_train = np.array(emotions_hidden[\"train\"][\"hidden_state\"])\n", 614 | "X_valid = np.array(emotions_hidden[\"validation\"][\"hidden_state\"])\n", 615 | "y_train = np.array(emotions_hidden[\"train\"][\"label\"])\n", 616 | "y_valid = np.array(emotions_hidden[\"validation\"][\"label\"])\n", 617 | "X_train.shape, X_valid.shape" 618 | ] 619 | }, 620 | { 621 | "cell_type": "markdown", 622 | "metadata": {}, 623 | "source": [ 624 | "#### Visualizing the training set" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": null, 630 | "metadata": {}, 631 | "outputs": [], 632 | "source": [ 633 | "from umap import UMAP\n", 634 | "from sklearn.preprocessing import MinMaxScaler\n", 635 | "\n", 636 | "# Scale features to [0,1] range\n", 637 | "X_scaled = MinMaxScaler().fit_transform(X_train)\n", 638 | "# Initialize and fit UMAP\n", 639 | "mapper = UMAP(n_components=2, metric=\"cosine\").fit(X_scaled)\n", 640 | "# Create a DataFrame of 2D embeddings\n", 641 | "df_emb = pd.DataFrame(mapper.embedding_, columns=[\"X\", \"Y\"])\n", 642 | "df_emb[\"label\"] = y_train\n", 643 | "df_emb.head()" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": null, 649 | "metadata": {}, 650 | "outputs": [], 651 | "source": [ 652 | "fig, axes = plt.subplots(2, 3, figsize=(7,5))\n", 653 | "axes = axes.flatten()\n", 654 | "cmaps = [\"Greys\", \"Blues\", \"Oranges\", \"Reds\", \"Purples\", \"Greens\"]\n", 655 | "labels = emotions[\"train\"].features[\"label\"].names\n", 656 | "\n", 657 | "for i, (label, cmap) in enumerate(zip(labels, cmaps)):\n", 658 | " df_emb_sub = df_emb.query(f\"label == {i}\")\n", 659 | " axes[i].hexbin(df_emb_sub[\"X\"], df_emb_sub[\"Y\"], cmap=cmap,\n", 660 | " gridsize=20, linewidths=(0,))\n", 661 | " axes[i].set_title(label)\n", 662 | " axes[i].set_xticks([]), axes[i].set_yticks([])\n", 663 | "\n", 664 | "plt.tight_layout()\n", 665 | "plt.show()" 666 | ] 667 | }, 668 | { 669 | "cell_type": "markdown", 670 | "metadata": {}, 671 | "source": [ 672 | "#### Training a simple classifier\n" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": null, 678 | "metadata": {}, 679 | "outputs": [], 680 | "source": [ 681 | "#hide_output\n", 682 | "# We increase `max_iter` to guarantee convergence \n", 683 | "from sklearn.linear_model import LogisticRegression\n", 684 | "\n", 685 | "lr_clf = LogisticRegression(max_iter=3000)\n", 686 | "lr_clf.fit(X_train, y_train)" 687 | ] 688 | }, 689 | { 690 | "cell_type": "code", 691 | "execution_count": null, 692 | "metadata": {}, 693 | "outputs": [], 694 | "source": [ 695 | "lr_clf.score(X_valid, y_valid)" 696 | ] 697 | }, 698 | { 699 | "cell_type": "code", 700 | "execution_count": null, 701 | "metadata": {}, 702 | "outputs": [], 703 | "source": [ 704 | "from sklearn.dummy import DummyClassifier\n", 705 | "\n", 706 | "dummy_clf = DummyClassifier(strategy=\"most_frequent\")\n", 707 | "dummy_clf.fit(X_train, y_train)\n", 708 | "dummy_clf.score(X_valid, y_valid)" 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": null, 714 | "metadata": {}, 715 | "outputs": [], 716 | "source": [ 717 | "from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix\n", 718 | "\n", 
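"# Plot a confusion matrix normalized over the true labels\n",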
719 | "def plot_confusion_matrix(y_preds, y_true, labels):\n", 720 | " cm = confusion_matrix(y_true, y_preds, normalize=\"true\")\n", 721 | " fig, ax = plt.subplots(figsize=(6, 6))\n", 722 | " disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)\n", 723 | " disp.plot(cmap=\"Blues\", values_format=\".2f\", ax=ax, colorbar=False)\n", 724 | " plt.title(\"Normalized confusion matrix\")\n", 725 | " plt.show()\n", 726 | " \n", 727 | "y_preds = lr_clf.predict(X_valid)\n", 728 | "plot_confusion_matrix(y_preds, y_valid, labels)" 729 | ] 730 | }, 731 | { 732 | "cell_type": "markdown", 733 | "metadata": {}, 734 | "source": [ 735 | "### Fine-Tuning Transformers" 736 | ] 737 | }, 738 | { 739 | "cell_type": "markdown", 740 | "metadata": {}, 741 | "source": [ 742 | "#### Loading a pretrained model" 743 | ] 744 | }, 745 | { 746 | "cell_type": "code", 747 | "execution_count": null, 748 | "metadata": {}, 749 | "outputs": [], 750 | "source": [ 751 | "# hide_output\n", 752 | "from transformers import AutoModelForSequenceClassification\n", 753 | "\n", 754 | "num_labels = 6\n", 755 | "model = (AutoModelForSequenceClassification\n", 756 | " .from_pretrained(model_ckpt, num_labels=num_labels)\n", 757 | " .to(device))" 758 | ] 759 | }, 760 | { 761 | "cell_type": "markdown", 762 | "metadata": {}, 763 | "source": [ 764 | "#### Defining the performance metrics" 765 | ] 766 | }, 767 | { 768 | "cell_type": "code", 769 | "execution_count": null, 770 | "metadata": {}, 771 | "outputs": [], 772 | "source": [ 773 | "from sklearn.metrics import accuracy_score, f1_score\n", 774 | "\n", 775 | "def compute_metrics(pred):\n", 776 | " labels = pred.label_ids\n", 777 | " preds = pred.predictions.argmax(-1)\n", 778 | " f1 = f1_score(labels, preds, average=\"weighted\")\n", 779 | " acc = accuracy_score(labels, preds)\n", 780 | " return {\"accuracy\": acc, \"f1\": f1}" 781 | ] 782 | }, 783 | { 784 | "cell_type": "markdown", 785 | "metadata": {}, 786 | "source": [ 787 | "#### Training the model" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": null, 793 | "metadata": {}, 794 | "outputs": [], 795 | "source": [ 796 | "from huggingface_hub import notebook_login\n", 797 | "\n", 798 | "notebook_login()" 799 | ] 800 | }, 801 | { 802 | "cell_type": "code", 803 | "execution_count": null, 804 | "metadata": {}, 805 | "outputs": [], 806 | "source": [ 807 | "from huggingface_hub import HfFolder\n", 808 | "\n", 809 | "username = 'simonmesserli' #replace with your own username from hugging face.\n", 810 | "hub_token = HfFolder.get_token()" 811 | ] 812 | }, 813 | { 814 | "cell_type": "markdown", 815 | "metadata": {}, 816 | "source": [ 817 | "### Training with SageMaker" 818 | ] 819 | }, 820 | { 821 | "cell_type": "code", 822 | "execution_count": null, 823 | "metadata": {}, 824 | "outputs": [], 825 | "source": [ 826 | "import sagemaker.huggingface\n", 827 | "import sagemaker\n", 828 | "\n", 829 | "sess = sagemaker.Session()\n", 830 | "# sagemaker session bucket -> used for uploading data, models and logs\n", 831 | "# sagemaker will automatically create this bucket if it not exists\n", 832 | "sagemaker_session_bucket=None\n", 833 | "if sagemaker_session_bucket is None and sess is not None:\n", 834 | " # set to default bucket if a bucket name is not given\n", 835 | " sagemaker_session_bucket = sess.default_bucket()\n", 836 | "\n", 837 | "role = sagemaker.get_execution_role()\n", 838 | "sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)\n", 839 | "\n", 840 | "print(f\"sagemaker role arn: 
{role}\")\n", 841 | "print(f\"sagemaker bucket: {sess.default_bucket()}\")\n", 842 | "print(f\"sagemaker session region: {sess.boto_region_name}\")" 843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": null, 848 | "metadata": {}, 849 | "outputs": [], 850 | "source": [ 851 | "import botocore\n", 852 | "from datasets.filesystems import S3FileSystem\n", 853 | "\n", 854 | "s3 = S3FileSystem()\n", 855 | "\n", 856 | "s3_prefix = 'samples/datasets/02_classification'\n", 857 | "\n", 858 | "train_dataset=emotions_encoded[\"train\"]\n", 859 | "eval_dataset=emotions_encoded[\"validation\"]\n", 860 | "\n", 861 | "# save train_dataset to s3\n", 862 | "training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'\n", 863 | "train_dataset.save_to_disk(training_input_path, fs=s3)\n", 864 | "\n", 865 | "# save eval_dataset to s3\n", 866 | "eval_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/validation'\n", 867 | "eval_dataset.save_to_disk(eval_input_path, fs=s3)" 868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": null, 873 | "metadata": { 874 | "scrolled": true 875 | }, 876 | "outputs": [], 877 | "source": [ 878 | "!pygmentize ./scripts/02_classification_train.py" 879 | ] 880 | }, 881 | { 882 | "cell_type": "code", 883 | "execution_count": null, 884 | "metadata": {}, 885 | "outputs": [], 886 | "source": [ 887 | "from sagemaker.huggingface import HuggingFace\n", 888 | "import time\n", 889 | "\n", 890 | "batch_size = 64\n", 891 | "logging_steps = len(emotions_encoded[\"train\"]) // batch_size\n", 892 | "model_name = f\"{model_ckpt}-finetuned-emotion\"\n", 893 | "\n", 894 | "# hyperparameters, which are passed into the training job\n", 895 | "hyperparameters={'model_id':model_ckpt,\n", 896 | " 'num_train_epochs':2,\n", 897 | " 'learning_rate':2e-5,\n", 898 | " 'per_device_train_batch_size':batch_size,\n", 899 | " 'per_device_eval_batch_size':batch_size,\n", 900 | " 'learning_rate':2e-5,\n", 901 | " 'weight_decay':0.01,\n", 902 | " 'evaluation_strategy':\"epoch\",\n", 903 | " 'disable_tqdm':False,\n", 904 | " 'logging_steps':logging_steps,\n", 905 | " 'push_to_hub':True,\n", 906 | " 'hub_model_id':username + '/' + model_name,\n", 907 | " 'hub_strategy':\"every_save\",\n", 908 | " 'hub_token':hub_token\n", 909 | " }\n", 910 | "\n", 911 | "\n" 912 | ] 913 | }, 914 | { 915 | "cell_type": "code", 916 | "execution_count": null, 917 | "metadata": {}, 918 | "outputs": [], 919 | "source": [ 920 | "# define Training Job Name \n", 921 | "job_name = f'nlp-book-sagemaker-02classificaton-{time.strftime(\"%Y-%m-%d-%H-%M-%S\", time.localtime())}'\n", 922 | "\n", 923 | "# create the Estimator\n", 924 | "huggingface_estimator = HuggingFace(\n", 925 | " entry_point = '02_classification_train.py', # fine-tuning script used in training jon\n", 926 | " source_dir = './scripts', # directory where fine-tuning script is stored\n", 927 | " instance_type = 'ml.p3.2xlarge', # instances type used for the training job\n", 928 | " instance_count = 1, # the number of instances used for training\n", 929 | " base_job_name = job_name, # the name of the training job\n", 930 | " role = role, # IAM role used in training job to access AWS ressources, e.g. 
Amazon S3\n", 931 | " transformers_version = '4.11', # the transformers version used in the training job\n", 932 | " pytorch_version = '1.9', # the pytorch_version version used in the training job\n", 933 | " py_version = 'py38', # the python version used in the training job\n", 934 | " hyperparameters = hyperparameters, # the hyperparameter used for running the training job\n", 935 | ")" 936 | ] 937 | }, 938 | { 939 | "cell_type": "code", 940 | "execution_count": null, 941 | "metadata": { 942 | "scrolled": true 943 | }, 944 | "outputs": [], 945 | "source": [ 946 | "# define a data input dictonary with our uploaded s3 uris\n", 947 | "data = {\n", 948 | " 'train': training_input_path,\n", 949 | " 'test': eval_input_path\n", 950 | "}\n", 951 | "\n", 952 | "# starting the train job with our uploaded datasets as input\n", 953 | "huggingface_estimator.fit(data, wait=True)" 954 | ] 955 | }, 956 | { 957 | "cell_type": "markdown", 958 | "metadata": {}, 959 | "source": [ 960 | "The logs can be found in Amazon CloudWatch: https://console.aws.amazon.com/cloudwatch/home#logsV2:log-groups/log-group/$252Faws$252Fsagemaker$252FTrainingJobs" 961 | ] 962 | }, 963 | { 964 | "cell_type": "code", 965 | "execution_count": null, 966 | "metadata": {}, 967 | "outputs": [], 968 | "source": [ 969 | "# the model is saved in the S3 bucket and was also pushed to the hugging face hub.\n", 970 | "print(huggingface_estimator.model_data)" 971 | ] 972 | }, 973 | { 974 | "cell_type": "code", 975 | "execution_count": null, 976 | "metadata": {}, 977 | "outputs": [], 978 | "source": [ 979 | "from transformers import Trainer, AutoModel\n", 980 | "\n", 981 | "# we load the model from the hub to the trainer and do further analyses.\n", 982 | "\n", 983 | "model_finetuned = AutoModelForSequenceClassification.from_pretrained('simonmesserli' + '/' + model_name)\n", 984 | "\n", 985 | "trainer = Trainer(model = model_finetuned)" 986 | ] 987 | }, 988 | { 989 | "cell_type": "markdown", 990 | "metadata": {}, 991 | "source": [ 992 | "### Deploy model with SageMaker Endpoint" 993 | ] 994 | }, 995 | { 996 | "cell_type": "code", 997 | "execution_count": null, 998 | "metadata": { 999 | "scrolled": true 1000 | }, 1001 | "outputs": [], 1002 | "source": [ 1003 | "predictor = huggingface_estimator.deploy(1,\"ml.g4dn.xlarge\")" 1004 | ] 1005 | }, 1006 | { 1007 | "cell_type": "code", 1008 | "execution_count": null, 1009 | "metadata": {}, 1010 | "outputs": [], 1011 | "source": [ 1012 | "custom_tweet = {\"inputs\" : \"I saw a movie today and it was really good.\"}\n", 1013 | "predictor.predict(custom_tweet)" 1014 | ] 1015 | }, 1016 | { 1017 | "cell_type": "markdown", 1018 | "metadata": {}, 1019 | "source": [ 1020 | "After running your requests, make sure to delete your endpoint." 
1021 | ] 1022 | }, 1023 | { 1024 | "cell_type": "code", 1025 | "execution_count": null, 1026 | "metadata": {}, 1027 | "outputs": [], 1028 | "source": [ 1029 | "predictor.delete_endpoint()" 1030 | ] 1031 | }, 1032 | { 1033 | "cell_type": "code", 1034 | "execution_count": null, 1035 | "metadata": {}, 1036 | "outputs": [], 1037 | "source": [ 1038 | "# hide_output\n", 1039 | "preds_output = trainer.predict(emotions_encoded[\"validation\"])" 1040 | ] 1041 | }, 1042 | { 1043 | "cell_type": "code", 1044 | "execution_count": null, 1045 | "metadata": {}, 1046 | "outputs": [], 1047 | "source": [ 1048 | "preds_output.metrics" 1049 | ] 1050 | }, 1051 | { 1052 | "cell_type": "code", 1053 | "execution_count": null, 1054 | "metadata": {}, 1055 | "outputs": [], 1056 | "source": [ 1057 | "y_preds = np.argmax(preds_output.predictions, axis=1)" 1058 | ] 1059 | }, 1060 | { 1061 | "cell_type": "code", 1062 | "execution_count": null, 1063 | "metadata": {}, 1064 | "outputs": [], 1065 | "source": [ 1066 | "plot_confusion_matrix(y_preds, y_valid, labels)" 1067 | ] 1068 | }, 1069 | { 1070 | "cell_type": "markdown", 1071 | "metadata": {}, 1072 | "source": [ 1073 | "#### Error analysis" 1074 | ] 1075 | }, 1076 | { 1077 | "cell_type": "code", 1078 | "execution_count": null, 1079 | "metadata": {}, 1080 | "outputs": [], 1081 | "source": [ 1082 | "from torch.nn.functional import cross_entropy\n", 1083 | "\n", 1084 | "def forward_pass_with_label(batch):\n", 1085 | " # Place all input tensors on the same device as the model\n", 1086 | " inputs = {k:v.to(device) for k,v in batch.items() \n", 1087 | " if k in tokenizer.model_input_names}\n", 1088 | "\n", 1089 | " with torch.no_grad():\n", 1090 | " output = model(**inputs)\n", 1091 | " pred_label = torch.argmax(output.logits, axis=-1)\n", 1092 | " loss = cross_entropy(output.logits, batch[\"label\"].to(device), \n", 1093 | " reduction=\"none\")\n", 1094 | "\n", 1095 | " # Place outputs on CPU for compatibility with other dataset columns \n", 1096 | " return {\"loss\": loss.cpu().numpy(), \n", 1097 | " \"predicted_label\": pred_label.cpu().numpy()}" 1098 | ] 1099 | }, 1100 | { 1101 | "cell_type": "code", 1102 | "execution_count": null, 1103 | "metadata": {}, 1104 | "outputs": [], 1105 | "source": [ 1106 | "#hide_output\n", 1107 | "# Convert our dataset back to PyTorch tensors\n", 1108 | "emotions_encoded.set_format(\"torch\", \n", 1109 | " columns=[\"input_ids\", \"attention_mask\", \"label\"])\n", 1110 | "# Compute loss values\n", 1111 | "emotions_encoded[\"validation\"] = emotions_encoded[\"validation\"].map(\n", 1112 | " forward_pass_with_label, batched=True, batch_size=16)" 1113 | ] 1114 | }, 1115 | { 1116 | "cell_type": "code", 1117 | "execution_count": null, 1118 | "metadata": {}, 1119 | "outputs": [], 1120 | "source": [ 1121 | "emotions_encoded.set_format(\"pandas\")\n", 1122 | "cols = [\"text\", \"label\", \"predicted_label\", \"loss\"]\n", 1123 | "df_test = emotions_encoded[\"validation\"][:][cols]\n", 1124 | "df_test[\"label\"] = df_test[\"label\"].apply(label_int2str)\n", 1125 | "df_test[\"predicted_label\"] = (df_test[\"predicted_label\"]\n", 1126 | " .apply(label_int2str))" 1127 | ] 1128 | }, 1129 | { 1130 | "cell_type": "code", 1131 | "execution_count": null, 1132 | "metadata": {}, 1133 | "outputs": [], 1134 | "source": [ 1135 | "#hide_output\n", 1136 | "df_test.sort_values(\"loss\", ascending=False).head(10)" 1137 | ] 1138 | }, 1139 | { 1140 | "cell_type": "code", 1141 | "execution_count": null, 1142 | "metadata": {}, 1143 | "outputs": [], 1144 | "source": [ 
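"# Examples with the lowest loss are the ones the model classifies with the\n", "# highest confidence.\n",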
1145 | "#hide_output\n", 1146 | "df_test.sort_values(\"loss\", ascending=True).head(10)" 1147 | ] 1148 | }, 1149 | { 1150 | "cell_type": "markdown", 1151 | "metadata": {}, 1152 | "source": [ 1153 | "#### Saving and sharing the model" 1154 | ] 1155 | }, 1156 | { 1157 | "cell_type": "code", 1158 | "execution_count": null, 1159 | "metadata": {}, 1160 | "outputs": [], 1161 | "source": [ 1162 | "#hide_output\n", 1163 | "from transformers import pipeline\n", 1164 | "\n", 1165 | "# Change `simonmesserli` to your Hub username\n", 1166 | "model_id = \"simonmesserli/distilbert-base-uncased-finetuned-emotion\"\n", 1167 | "classifier = pipeline(\"text-classification\", model=model_id)" 1168 | ] 1169 | }, 1170 | { 1171 | "cell_type": "code", 1172 | "execution_count": null, 1173 | "metadata": {}, 1174 | "outputs": [], 1175 | "source": [ 1176 | "custom_tweet = \"I saw a movie today and it was really good.\"\n", 1177 | "preds = classifier(custom_tweet, return_all_scores=True)" 1178 | ] 1179 | }, 1180 | { 1181 | "cell_type": "code", 1182 | "execution_count": null, 1183 | "metadata": {}, 1184 | "outputs": [], 1185 | "source": [ 1186 | "preds_df = pd.DataFrame(preds[0])\n", 1187 | "plt.bar(labels, 100 * preds_df[\"score\"], color='C0')\n", 1188 | "plt.title(f'\"{custom_tweet}\"')\n", 1189 | "plt.ylabel(\"Class probability (%)\")\n", 1190 | "plt.show()" 1191 | ] 1192 | }, 1193 | { 1194 | "cell_type": "markdown", 1195 | "metadata": {}, 1196 | "source": [ 1197 | "## Conclusion" 1198 | ] 1199 | } 1200 | ], 1201 | "metadata": { 1202 | "kernelspec": { 1203 | "display_name": "Python 3.9.13 64-bit", 1204 | "language": "python", 1205 | "name": "python3" 1206 | }, 1207 | "language_info": { 1208 | "codemirror_mode": { 1209 | "name": "ipython", 1210 | "version": 3 1211 | }, 1212 | "file_extension": ".py", 1213 | "mimetype": "text/x-python", 1214 | "name": "python", 1215 | "nbconvert_exporter": "python", 1216 | "pygments_lexer": "ipython3", 1217 | "version": "3.9.13" 1218 | }, 1219 | "vscode": { 1220 | "interpreter": { 1221 | "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" 1222 | } 1223 | } 1224 | }, 1225 | "nbformat": 4, 1226 | "nbformat_minor": 4 1227 | } 1228 | -------------------------------------------------------------------------------- /SageMaker/README.md: -------------------------------------------------------------------------------- 1 | # Run the Hugging Face notebooks on SageMaker 2 | 3 | 1/ 4 | Open Amazon SageMaker Notebook Instances in your preferred AWS region: 5 | https://console.aws.amazon.com/sagemaker/home#/notebook-instances 6 | 7 | 2/ 8 | Click **Create notebook instance**. 9 | 10 | 3/ 11 | **Choose an instance type:** 12 | Any ml.t3.* instance or if you want to have an instance with GPU take e.g. ml.g4dn.xlarge. \ 13 | Overview: https://aws.amazon.com/sagemaker/pricing/ 14 | 15 | notebook-config 16 | 17 | 4/ 18 | Choose **notebook-al2-v1** and add more storage volume, e.g. 50 GB. 19 | 20 | 5/ (optional) 21 | If you plan to use JupyterLab, make sure to add a Lifecycle configuration with the following code. 22 | Or execute the code in a terminal in JupyterLab. 23 | 24 | ` 25 | sudo -u ec2-user -i <<'EOF' 26 | EXTENSION_NAME=@jupyter-widgets/jupyterlab-manager 27 | source /home/ec2-user/anaconda3/bin/activate JupyterSystemEnv 28 | jupyter labextension install $EXTENSION_NAME 29 | source /home/ec2-user/anaconda3/bin/deactivate 30 | EOF 31 | ` 32 | 33 | 5/ 34 | Create a new IAM role, which will used in the notebooks to access AWS resources. 
35 | 36 | ![iam-role](images/iam_role.png) 37 | 38 | 7/ 39 | Add this Git repository, which will be cloned to your notebook instance, by selecting *Clone a public Git repository to this notebook instance only*. 40 | Paste the Git repository URL into the next field. 41 | 42 | ![git-repo](images/git_repo.png) 43 | 44 | 8/ 45 | The notebook instance will now be created, and after you click **Open Jupyter** you will see the cloned notebooks. 46 | Navigate to the folder *SageMaker*, open the first chapter and execute one cell after the other. -------------------------------------------------------------------------------- /SageMaker/images/git_repo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/SageMaker/images/git_repo.png -------------------------------------------------------------------------------- /SageMaker/images/iam_role.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/SageMaker/images/iam_role.png -------------------------------------------------------------------------------- /SageMaker/images/notebook_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/SageMaker/images/notebook_config.png -------------------------------------------------------------------------------- /SageMaker/scripts/02_classification_train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import random 5 | import sys 6 | 7 | import numpy as np 8 | import torch 9 | from datasets import load_from_disk, load_metric 10 | from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments 11 | from transformers.trainer_utils import get_last_checkpoint 12 | 13 | if __name__ == "__main__": 14 | 15 | parser = argparse.ArgumentParser() 16 | 17 | # hyperparameters sent by the client are passed as command-line arguments to the script. 18 | parser.add_argument("--model_id", type=str) 19 | parser.add_argument("--num_train_epochs", type=int, default=2) 20 | parser.add_argument("--learning_rate", type=str, default=2e-5) # parsed as a string; cast to float when building TrainingArguments 21 | parser.add_argument("--per_device_train_batch_size", type=int, default=64) 22 | parser.add_argument("--per_device_eval_batch_size", type=int, default=64) 23 | parser.add_argument("--weight_decay", type=float, default=0.01) 24 | parser.add_argument("--evaluation_strategy", type=str, default="epoch") 25 | parser.add_argument("--disable_tqdm", type=bool, default=False) 26 | parser.add_argument("--logging_steps", type=int, default=100) 27 | 28 | # Push to Hub Parameters 29 | parser.add_argument("--push_to_hub", type=bool, default=True) 30 | parser.add_argument("--hub_model_id", type=str, default=None) 31 | parser.add_argument("--hub_strategy", type=str, default=None) 32 | parser.add_argument("--hub_token", type=str, default=None) 33 | 34 | # TODO: the options below are left disabled pending review (check with L.) 35 | # parser.add_argument("--warmup_steps", type=int, default=500) 36 | # parser.add_argument("--fp16", type=bool, default=True)
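# Editorial note, a sketch rather than part of the original script: argparse's
# `type=bool` does not parse strings the way one might expect: bool("False")
# is True, and SageMaker passes hyperparameters to the script as strings, so
# --disable_tqdm and --push_to_hub are truthy for any non-empty value. One
# minimal fix (hypothetical helper, not used above) is:
#
#     def str2bool(v):
#         return str(v).lower() in ("1", "t", "true", "yes")
#
# and then passing `type=str2bool` to the corresponding add_argument calls.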
37 | 38 | # Data, model, and output directories 39 | parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) 40 | parser.add_argument("--output_dir", type=str, default=os.environ["SM_MODEL_DIR"]) 41 | parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"]) 42 | parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) 43 | parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"]) 44 | 45 | args, _ = parser.parse_known_args() 46 | 47 | # make sure we have required parameters to push 48 | if args.push_to_hub: 49 | if args.hub_strategy is None: 50 | raise ValueError("--hub_strategy is required when pushing to Hub") 51 | if args.hub_token is None: 52 | raise ValueError("--hub_token is required when pushing to Hub") 53 | 54 | # sets hub id if not provided 55 | if args.hub_model_id is None: 56 | args.hub_model_id = args.model_id.replace("/", "--") 57 | 58 | # Set up logging 59 | logger = logging.getLogger(__name__) 60 | 61 | logging.basicConfig( 62 | level=logging.getLevelName("INFO"), 63 | handlers=[logging.StreamHandler(sys.stdout)], 64 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 65 | ) 66 | 67 | # load datasets 68 | train_dataset = load_from_disk(args.training_dir) 69 | test_dataset = load_from_disk(args.test_dir) 70 | 71 | logger.info(f" loaded train_dataset length is: {len(train_dataset)}") 72 | logger.info(f" loaded test_dataset length is: {len(test_dataset)}") 73 | 74 | metric = load_metric("accuracy") 75 | 76 | def compute_metrics(eval_pred): 77 | predictions, labels = eval_pred 78 | predictions = np.argmax(predictions, axis=1) 79 | return metric.compute(predictions=predictions, references=labels) 80 | 81 | # Prepare model labels - useful in inference API 82 | labels = train_dataset.features["label"].names 83 | num_labels = len(labels) 84 | label2id, id2label = dict(), dict() 85 | for i, label in enumerate(labels): 86 | label2id[label] = str(i) 87 | id2label[str(i)] = label 88 | 89 | # download model from model hub 90 | model = AutoModelForSequenceClassification.from_pretrained( 91 | args.model_id, num_labels=num_labels, label2id=label2id, id2label=id2label 92 | ) 93 | tokenizer = AutoTokenizer.from_pretrained(args.model_id) 94 | 95 | # define training args 96 | training_args = TrainingArguments( 97 | output_dir=args.output_dir, 98 | overwrite_output_dir=True if get_last_checkpoint(args.output_dir) is not None else False, 99 | num_train_epochs=args.num_train_epochs, 100 | learning_rate=float(args.learning_rate), 101 | per_device_train_batch_size=args.per_device_train_batch_size, 102 | per_device_eval_batch_size=args.per_device_eval_batch_size, 103 | weight_decay=args.weight_decay, 104 | evaluation_strategy=args.evaluation_strategy, 105 | disable_tqdm=args.disable_tqdm, 106 | logging_steps=args.logging_steps, 107 | # push to hub parameters 108 | push_to_hub=args.push_to_hub, 109 | hub_strategy=args.hub_strategy, 110 | hub_model_id=args.hub_model_id, 111 | hub_token=args.hub_token, 112 | save_strategy="epoch", 113 | save_total_limit=2, 114 | logging_dir=f"{args.output_data_dir}/logs", 115 | load_best_model_at_end=True, 116 | metric_for_best_model="accuracy" 117 | 118 | #warmup_steps=args.warmup_steps, 119 | #fp16=args.fp16, 120 | ) 121 | 122 | # create Trainer instance 123 | trainer = Trainer( 124 | model=model, 125 | args=training_args, 126 | compute_metrics=compute_metrics, 127 | train_dataset=train_dataset, 128 | eval_dataset=test_dataset, 129 | 
tokenizer=tokenizer, 130 | ) 131 | 132 | # train model 133 | if get_last_checkpoint(args.output_dir) is not None: 134 | logger.info("***** continue training *****") 135 | last_checkpoint = get_last_checkpoint(args.output_dir) 136 | trainer.train(resume_from_checkpoint=last_checkpoint) 137 | else: 138 | trainer.train() 139 | 140 | # evaluate model 141 | eval_result = trainer.evaluate(eval_dataset=test_dataset) 142 | 143 | # write the eval results to a file that can be accessed later in the S3 output 144 | with open(os.path.join(args.output_data_dir, "eval_results.txt"), "w") as writer: 145 | print("***** Eval results *****") 146 | for key, value in sorted(eval_result.items()): 147 | writer.write(f"{key} = {value}\n") 148 | print(f"{key} = {value}\n") 149 | 150 | # Save the model to S3; using os.environ["SM_MODEL_DIR"] makes sure checkpointing works 151 | trainer.save_model(os.environ["SM_MODEL_DIR"]) -------------------------------------------------------------------------------- /SageMaker/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | from textwrap import TextWrapper 4 | 5 | import datasets 6 | import huggingface_hub 7 | import matplotlib.font_manager as font_manager 8 | import matplotlib.pyplot as plt 9 | import torch 10 | import transformers 11 | from IPython.display import set_matplotlib_formats 12 | 13 | # TODO: Consider adding SageMaker StudioLab 14 | is_colab = "google.colab" in sys.modules 15 | is_kaggle = "kaggle_secrets" in sys.modules 16 | is_gpu_available = torch.cuda.is_available() 17 | 18 | 19 | def install_mpl_fonts(): 20 | font_dir = ["../orm_fonts/"] 21 | for font in font_manager.findSystemFonts(font_dir): 22 | font_manager.fontManager.addfont(font) 23 | 24 | 25 | def set_plot_style(): 26 | #install_mpl_fonts() 27 | set_matplotlib_formats("pdf", "svg") 28 | #plt.style.use("plotting.mplstyle") 29 | logging.getLogger("matplotlib").setLevel(level=logging.ERROR) 30 | 31 | 32 | def display_library_version(library): 33 | print(f"Using {library.__name__} v{library.__version__}") 34 | 35 | 36 | def setup_chapter(): 37 | # Check if we have a GPU 38 | if not is_gpu_available: 39 | print("No GPU was detected! 
This notebook can be *very* slow without a GPU 🐢") 40 | if is_colab: 41 | print("Go to Runtime > Change runtime type and select a GPU hardware accelerator.") 42 | if is_kaggle: 43 | print("Go to Settings > Accelerator and select GPU.") 44 | # Give visibility on versions of the core libraries 45 | display_library_version(transformers) 46 | display_library_version(datasets) 47 | # Disable all info / warning messages 48 | transformers.logging.set_verbosity_error() 49 | datasets.logging.set_verbosity_error() 50 | # Logging is only available for the chapters that don't depend on Haystack 51 | if huggingface_hub.__version__ == "0.0.19": 52 | huggingface_hub.logging.set_verbosity_error() 53 | # Use O'Reilly style for plots 54 | set_plot_style() 55 | 56 | 57 | def wrap_print_text(print): 58 | """Adapted from: https://stackoverflow.com/questions/27621655/how-to-overload-print-function-to-expand-its-functionality/27621927""" 59 | 60 | def wrapped_func(text): 61 | if not isinstance(text, str): 62 | text = str(text) 63 | wrapper = TextWrapper( 64 | width=80, 65 | break_long_words=True, 66 | break_on_hyphens=False, 67 | replace_whitespace=False, 68 | ) 69 | return print("\n".join(wrapper.fill(line) for line in text.split("\n"))) 70 | 71 | return wrapped_func 72 | 73 | 74 | print = wrap_print_text(print) 75 | -------------------------------------------------------------------------------- /environment-chapter7.yml: -------------------------------------------------------------------------------- 1 | name: book-chapter7 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - python=3.9 7 | - cudatoolkit=11.0 8 | - pip 9 | - notebook 10 | - ipykernel 11 | - pip: 12 | - farm-haystack==0.9.0 13 | - datasets==1.11.0 14 | - matplotlib 15 | - ipywidgets -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: book 2 | channels: 3 | - pytorch 4 | - defaults 5 | - pyg 6 | - conda-forge 7 | dependencies: 8 | - python=3.9 9 | - cudatoolkit=11.3 10 | - pytorch-scatter 11 | - pip 12 | - notebook 13 | - ipykernel 14 | - ipywidgets 15 | - git-lfs 16 | - libsndfile 17 | - pip: 18 | - -r requirements.txt 19 | -------------------------------------------------------------------------------- /images/book_cover.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/book_cover.jpg -------------------------------------------------------------------------------- /images/chapter01_enc-dec-attn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter01_enc-dec-attn.png -------------------------------------------------------------------------------- /images/chapter01_enc-dec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter01_enc-dec.png -------------------------------------------------------------------------------- /images/chapter01_hf-ecosystem.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter01_hf-ecosystem.png -------------------------------------------------------------------------------- /images/chapter01_hub-model-card.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter01_hub-model-card.png -------------------------------------------------------------------------------- /images/chapter01_hub-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter01_hub-overview.png -------------------------------------------------------------------------------- /images/chapter01_rnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter01_rnn.png -------------------------------------------------------------------------------- /images/chapter01_self-attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter01_self-attention.png -------------------------------------------------------------------------------- /images/chapter01_timeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter01_timeline.png -------------------------------------------------------------------------------- /images/chapter01_transfer-learning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter01_transfer-learning.png -------------------------------------------------------------------------------- /images/chapter01_ulmfit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter01_ulmfit.png -------------------------------------------------------------------------------- /images/chapter02_attention-alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter02_attention-alignment.png -------------------------------------------------------------------------------- /images/chapter02_attention-mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter02_attention-mask.png -------------------------------------------------------------------------------- /images/chapter02_encoder-classifier.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter02_encoder-classifier.png 
-------------------------------------------------------------------------------- /images/chapter02_encoder-feature-based.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter02_encoder-feature-based.png -------------------------------------------------------------------------------- /images/chapter02_encoder-fine-tuning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter02_encoder-fine-tuning.png -------------------------------------------------------------------------------- /images/chapter02_hf-libraries.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter02_hf-libraries.png -------------------------------------------------------------------------------- /images/chapter02_transformers-compact.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | chapter02_transformers-compact.html 6 | 7 | 8 | 9 |
10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /images/chapter02_transformers.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | chapter02_transformers.html 6 | 7 | 8 | 9 |
10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /images/chapter02_transformers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter02_transformers.png -------------------------------------------------------------------------------- /images/chapter02_tweet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter02_tweet.png -------------------------------------------------------------------------------- /images/chapter03_attention-ops.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter03_attention-ops.png -------------------------------------------------------------------------------- /images/chapter03_contextualized-embedding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter03_contextualized-embedding.png -------------------------------------------------------------------------------- /images/chapter03_decoder-zoom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter03_decoder-zoom.png -------------------------------------------------------------------------------- /images/chapter03_encoder-zoom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter03_encoder-zoom.png -------------------------------------------------------------------------------- /images/chapter03_layer-norm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter03_layer-norm.png -------------------------------------------------------------------------------- /images/chapter03_multihead-attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter03_multihead-attention.png -------------------------------------------------------------------------------- /images/chapter03_transformer-encoder-decoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter03_transformer-encoder-decoder.png -------------------------------------------------------------------------------- /images/chapter03_transformers-compact.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter03_transformers-compact.png 
-------------------------------------------------------------------------------- /images/chapter04_bert-body-head.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter04_bert-body-head.png -------------------------------------------------------------------------------- /images/chapter04_clf-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter04_clf-architecture.png -------------------------------------------------------------------------------- /images/chapter04_ner-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter04_ner-architecture.png -------------------------------------------------------------------------------- /images/chapter04_ner-widget.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter04_ner-widget.png -------------------------------------------------------------------------------- /images/chapter04_tokenizer-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter04_tokenizer-pipeline.png -------------------------------------------------------------------------------- /images/chapter05_beam-search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter05_beam-search.png -------------------------------------------------------------------------------- /images/chapter05_lm-meta-learning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter05_lm-meta-learning.png -------------------------------------------------------------------------------- /images/chapter05_meena.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter05_meena.png -------------------------------------------------------------------------------- /images/chapter05_text-generation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter05_text-generation.png -------------------------------------------------------------------------------- /images/chapter07_dpr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter07_dpr.png -------------------------------------------------------------------------------- /images/chapter07_marie-curie.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter07_marie-curie.png -------------------------------------------------------------------------------- /images/chapter07_phone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter07_phone.png -------------------------------------------------------------------------------- /images/chapter07_qa-head.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter07_qa-head.png -------------------------------------------------------------------------------- /images/chapter07_qa-pyramid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter07_qa-pyramid.png -------------------------------------------------------------------------------- /images/chapter07_rag-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter07_rag-architecture.png -------------------------------------------------------------------------------- /images/chapter07_retriever-reader.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter07_retriever-reader.png -------------------------------------------------------------------------------- /images/chapter07_sliding-window.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter07_sliding-window.png -------------------------------------------------------------------------------- /images/chapter07_squad-models.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter07_squad-models.png -------------------------------------------------------------------------------- /images/chapter07_squad-schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter07_squad-schema.png -------------------------------------------------------------------------------- /images/chapter07_squad-sota.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter07_squad-sota.png -------------------------------------------------------------------------------- /images/chapter08_bert-onnx.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_bert-onnx.png -------------------------------------------------------------------------------- /images/chapter08_fp32-to-int8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_fp32-to-int8.png -------------------------------------------------------------------------------- /images/chapter08_kd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_kd.png -------------------------------------------------------------------------------- /images/chapter08_magnitude-vs-movement.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_magnitude-vs-movement.png -------------------------------------------------------------------------------- /images/chapter08_network-pruning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_network-pruning.png -------------------------------------------------------------------------------- /images/chapter08_onnx-ort.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_onnx-ort.png -------------------------------------------------------------------------------- /images/chapter08_oos.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_oos.png -------------------------------------------------------------------------------- /images/chapter08_pegasus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_pegasus.png -------------------------------------------------------------------------------- /images/chapter08_pruning-dists.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_pruning-dists.png -------------------------------------------------------------------------------- /images/chapter08_roblox.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_roblox.png -------------------------------------------------------------------------------- /images/chapter08_soft-probs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_soft-probs.png 
-------------------------------------------------------------------------------- /images/chapter08_t5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter08_t5.png -------------------------------------------------------------------------------- /images/chapter09_decision-tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter09_decision-tree.png -------------------------------------------------------------------------------- /images/chapter09_faiss-index.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter09_faiss-index.png -------------------------------------------------------------------------------- /images/chapter09_issue-example-v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter09_issue-example-v2.png -------------------------------------------------------------------------------- /images/chapter09_nearest-neighbours.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter09_nearest-neighbours.png -------------------------------------------------------------------------------- /images/chapter09_uda.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter09_uda.png -------------------------------------------------------------------------------- /images/chapter09_ust.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter09_ust.png -------------------------------------------------------------------------------- /images/chapter10_code-snippet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter10_code-snippet.png -------------------------------------------------------------------------------- /images/chapter10_ddp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter10_ddp.png -------------------------------------------------------------------------------- /images/chapter10_preprocessing-clm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter10_preprocessing-clm.png -------------------------------------------------------------------------------- /images/chapter10_pretraining-clm.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter10_pretraining-clm.png -------------------------------------------------------------------------------- /images/chapter10_pretraining-mlm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter10_pretraining-mlm.png -------------------------------------------------------------------------------- /images/chapter10_pretraining-seq2seq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter10_pretraining-seq2seq.png -------------------------------------------------------------------------------- /images/chapter11_atomic-sparse-attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_atomic-sparse-attention.png -------------------------------------------------------------------------------- /images/chapter11_clip-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_clip-arch.png -------------------------------------------------------------------------------- /images/chapter11_compound-sparse-attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_compound-sparse-attention.png -------------------------------------------------------------------------------- /images/chapter11_dall-e.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_dall-e.png -------------------------------------------------------------------------------- /images/chapter11_efficient-attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_efficient-attention.png -------------------------------------------------------------------------------- /images/chapter11_iGPT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_iGPT.png -------------------------------------------------------------------------------- /images/chapter11_layoutlm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_layoutlm.png -------------------------------------------------------------------------------- /images/chapter11_linear-attention.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_linear-attention.png -------------------------------------------------------------------------------- /images/chapter11_scaling-modal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_scaling-modal.png -------------------------------------------------------------------------------- /images/chapter11_scaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_scaling.png -------------------------------------------------------------------------------- /images/chapter11_table-qa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_table-qa.png -------------------------------------------------------------------------------- /images/chapter11_tapas-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_tapas-architecture.png -------------------------------------------------------------------------------- /images/chapter11_vit-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_vit-architecture.png -------------------------------------------------------------------------------- /images/chapter11_vqa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_vqa.png -------------------------------------------------------------------------------- /images/chapter11_wav2vec-u.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_wav2vec-u.png -------------------------------------------------------------------------------- /images/chapter11_wav2vec2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/chapter11_wav2vec2.png -------------------------------------------------------------------------------- /images/doge.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/doge.jpg -------------------------------------------------------------------------------- /images/optimusprime.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-with-transformers/notebooks/0cb211095b4622fa922f80fbdc9d83cc5d9e0c34/images/optimusprime.jpg -------------------------------------------------------------------------------- /install.py: 
-------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | 4 | is_colab = "google.colab" in sys.modules 5 | is_kaggle = "kaggle_secrets" in sys.modules 6 | # torch-scatter binaries depend on the torch and CUDA version, so we define the 7 | # mappings here for Colab & Kaggle 8 | torch_to_cuda = {"1.10.0": "cu113", "1.9.0": "cu111", "1.9.1": "cu111"} 9 | 10 | 11 | def install_requirements( 12 | is_chapter2: bool = False, 13 | is_chapter6: bool = False, 14 | is_chapter7: bool = False, 15 | is_chapter7_v2: bool = False, 16 | is_chapter10: bool = False, 17 | is_chapter11: bool = False 18 | ): 19 | """Installs the required packages for the project.""" 20 | 21 | print("⏳ Installing base requirements ...") 22 | cmd = ["python", "-m", "pip", "install", "-r"] 23 | if is_chapter7: 24 | cmd += "requirements-chapter7.txt -f https://download.pytorch.org/whl/torch_stable.html".split() 25 | elif is_chapter7_v2: 26 | cmd.append("requirements-chapter7-v2.txt") 27 | else: 28 | cmd.append("requirements.txt") 29 | process_install = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 30 | if process_install.returncode != 0: 31 | raise Exception("😭 Failed to install base requirements") 32 | else: 33 | print("✅ Base requirements installed!") 34 | print("⏳ Installing Git LFS ...") 35 | process_lfs = subprocess.run(["apt", "install", "git-lfs"], stdout=subprocess.PIPE, stderr=subprocess.PIPE) 36 | if process_lfs.returncode != 0: 37 | raise Exception("😭 Failed to install Git LFS") 38 | else: 39 | print("✅ Git LFS installed!") 40 | 41 | if is_chapter2: 42 | transformers_cmd = "python -m pip install transformers==4.13.0 datasets==2.8.0".split() 43 | process_transformers = subprocess.run( 44 | transformers_cmd, 45 | stdout=subprocess.PIPE, 46 | stderr=subprocess.PIPE, 47 | ) 48 | 49 | if is_chapter6: 50 | datasets_cmd = "python -m pip install datasets==2.0.0".split() 51 | process_datasets = subprocess.run( 52 | datasets_cmd, 53 | stdout=subprocess.PIPE, 54 | stderr=subprocess.PIPE, 55 | ) 56 | 57 | if is_chapter10: 58 | wandb_cmd = "python -m pip install wandb".split() 59 | process_wandb = subprocess.run( 60 | wandb_cmd, 61 | stdout=subprocess.PIPE, 62 | stderr=subprocess.PIPE, 63 | ) 64 | if is_chapter11: 65 | import torch 66 | 67 | torch_version = torch.__version__.split("+")[0] 68 | print(f"⏳ Installing torch-scatter for torch v{torch_version} ...") 69 | if is_colab: 70 | torch_scatter_cmd = f"python -m pip install torch-scatter -f https://data.pyg.org/whl/torch-{torch_version}+{torch_to_cuda[torch_version]}.html".split() 71 | else: 72 | # Kaggle uses CUDA 11.0 by default, so we need to build from source 73 | torch_scatter_cmd = "python -m pip install torch-scatter".split() 74 | process_scatter = subprocess.run( 75 | torch_scatter_cmd, 76 | stdout=subprocess.PIPE, 77 | stderr=subprocess.PIPE, 78 | ) 79 | if process_scatter.returncode != 0: 80 | raise Exception("😭 Failed to install torch-scatter") 81 | else: 82 | print("✅ torch-scatter installed!") 83 | print("⏳ Installing soundfile ...") 84 | process_audio = subprocess.run( 85 | ["apt", "install", "libsndfile1"], stdout=subprocess.PIPE, stderr=subprocess.PIPE 86 | ) 87 | if process_audio.returncode != 0: 88 | raise Exception("😭 Failed to install soundfile") 89 | else: 90 | print("✅ soundfile installed!") 91 | print("🥳 Chapter installation complete!") 92 | -------------------------------------------------------------------------------- /plotting.mplstyle: 
/plotting.mplstyle:
--------------------------------------------------------------------------------
savefig.dpi: 300
figure.figsize: 6, 4  # figure size in inches

axes.prop_cycle: cycler('color', ['0071bc', 'f7931e', 'c1272d', '009245', 'ffde00', '9900cc'])

font.size: 12.0
font.family: Guardian Sans Cond
pdf.fonttype: 42
ps.fonttype: 42
--------------------------------------------------------------------------------
/requirements-chapter7-v2.txt:
--------------------------------------------------------------------------------
# Base requirements
farm-haystack[colab]==1.4.0
matplotlib
datasets
--------------------------------------------------------------------------------
/requirements-chapter7.txt:
--------------------------------------------------------------------------------
# Base requirements
farm-haystack==0.9.0
matplotlib
# Colab fix since FARM requires PyTorch v1.8.1 but v1.10.0 is installed by
# default. See: https://github.com/deepset-ai/haystack/issues/1787
torch==1.8.1+cu111
torchvision==0.9.1+cu111
torchaudio==0.8.1
# We need this version because haystack pins
# transformers 4.6.1 which depends on huggingface-hub==0.0.8
# and is incompatible with later versions of datasets
datasets==1.11.0
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
# Base requirements
transformers[tf,torch,sentencepiece,vision,optuna,sklearn,onnxruntime]==4.16.2
datasets[audio]==1.16.1
matplotlib
ipywidgets
# Chapter 2 - Classification
umap-learn==0.5.1
# Chapter 3 - Anatomy
bertviz==1.2.0
# Chapter 4 - NER
seqeval==1.2.2
# Chapter 6 - Summarization
nltk==3.9
sacrebleu==1.5.1
rouge-score==0.0.4
py7zr  # Needed for samsum dataset
# Chapter 9 - Few labels
nlpaug==1.1.7
scikit-multilearn==0.2.0
# Chapter 10 - Pretraining
psutil
accelerate==0.5.1
--------------------------------------------------------------------------------
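Because several chapters override these base pins at runtime (see install.py above), it can be useful to confirm which versions actually resolved after installation. A minimal sketch, not part of the repo; the pins in the dictionary mirror requirements.txt:

# Minimal sketch (not part of the repo): check that the base pins from
# requirements.txt resolved to the expected versions.
from importlib import metadata

pins = {"transformers": "4.16.2", "datasets": "1.16.1", "accelerate": "0.5.1"}
for pkg, pinned in pins.items():
    installed = metadata.version(pkg)
    marker = "✅" if installed == pinned else "⚠️"
    print(f"{marker} {pkg}: pinned {pinned}, installed {installed}")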
/scripts/create_notebook_table.py:
--------------------------------------------------------------------------------
import pandas as pd

GITHUB_PATH_PREFIX = "nlp-with-transformers/notebooks/blob/main/"

CHAPTER_TO_NB = {
    "Introduction": "01_introduction",
    "Text Classification": "02_classification",
    "Transformer Anatomy": "03_transformer-anatomy",
    "Multilingual Named Entity Recognition": "04_multilingual-ner",
    "Text Generation": "05_text-generation",
    "Summarization": "06_summarization",
    "Question Answering": "07_question-answering",
    "Making Transformers Efficient in Production": "08_model-compression",
    "Dealing with Few to No Labels": "09_few-to-no-labels",
    "Training Transformers from Scratch": "10_transformers-from-scratch",
    "Future Directions": "11_future-directions",
}


def _find_text_in_file(filename, start_prompt, end_prompt):
    """
    Find the text in `filename` between a line beginning with `start_prompt` and before `end_prompt`, removing empty
    lines.

    Copied from: https://github.com/huggingface/transformers/blob/16f0b7d72c6d4e122957392c342b074aa2c5c519/utils/check_table.py#L30
    """
    with open(filename, "r", encoding="utf-8", newline="\n") as f:
        lines = f.readlines()
    # Find the start prompt.
    start_index = 0
    while not lines[start_index].startswith(start_prompt):
        start_index += 1
    start_index += 1

    end_index = start_index
    while not lines[end_index].startswith(end_prompt):
        end_index += 1
    end_index -= 1

    while len(lines[start_index]) <= 1:
        start_index += 1
    while len(lines[end_index]) <= 1:
        end_index -= 1
    end_index += 1
    return "".join(lines[start_index:end_index]), start_index, end_index, lines


def create_table():
    data = {"Chapter": [], "Colab": [], "Kaggle": [], "Gradient": [], "Studio Lab": []}
    for title, nb in CHAPTER_TO_NB.items():
        nb_path = f"{GITHUB_PATH_PREFIX}{nb}.ipynb"
        data["Chapter"].append(title)
        data["Colab"].append(
            f"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/{nb_path})"
        )
        data["Kaggle"].append(
            f"[![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/{nb_path})"
        )
        data["Gradient"].append(
            f"[![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/{nb_path})"
        )
        data["Studio Lab"].append(
            f"[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/{nb_path})"
        )
    return pd.DataFrame(data).to_markdown(index=False) + "\n"


def main():
    table = create_table()
    _, start_index, end_index, lines = _find_text_in_file(
        filename="README.md",
        start_prompt="",
        end_prompt="",
    )

    with open("README.md", "w", encoding="utf-8", newline="\n") as f:
        f.writelines(lines[:start_index] + [table] + lines[end_index:])


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
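Since main() rewrites README.md in place between two sentinel lines, a safer first step is to preview the generated table on its own. A minimal sketch, assuming it runs from the repository root and that the tabulate package (required by pandas' to_markdown) is installed:

# Minimal sketch (not part of the repo): preview the badge table
# without touching README.md.
from scripts.create_notebook_table import create_table

print(create_table())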
/settings.ini:
--------------------------------------------------------------------------------
[DEFAULT]
lib_name = notebooks
user = nlp-with-transformers
description = notebooks
keywords = jupyter notebook asciidoc
author = Lewis Tunstall and Leandro von Werra and Thomas Wolf
nbs_path = .
host = github
doc_host = https://nlp-with-transformers.github.io
doc_baseurl = /notebooks/
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import logging
import sys
from textwrap import TextWrapper

import datasets
import huggingface_hub
import matplotlib.font_manager as font_manager
import matplotlib.pyplot as plt
import torch
import transformers
from IPython.display import set_matplotlib_formats

# TODO: Consider adding SageMaker StudioLab
is_colab = "google.colab" in sys.modules
is_kaggle = "kaggle_secrets" in sys.modules
is_gpu_available = torch.cuda.is_available()


def install_mpl_fonts():
    font_dir = ["./orm_fonts/"]
    for font in font_manager.findSystemFonts(font_dir):
        font_manager.fontManager.addfont(font)


def set_plot_style():
    install_mpl_fonts()
    set_matplotlib_formats("pdf", "svg")
    plt.style.use("plotting.mplstyle")
    logging.getLogger("matplotlib").setLevel(level=logging.ERROR)


def display_library_version(library):
    print(f"Using {library.__name__} v{library.__version__}")


def setup_chapter():
    # Check if we have a GPU
    if not is_gpu_available:
        print("No GPU was detected! This notebook can be *very* slow without a GPU 🐢")
        if is_colab:
            print("Go to Runtime > Change runtime type and select a GPU hardware accelerator.")
        if is_kaggle:
            print("Go to Settings > Accelerator and select GPU.")
    # Give visibility on versions of the core libraries
    display_library_version(transformers)
    display_library_version(datasets)
    # Disable all info / warning messages
    transformers.logging.set_verbosity_error()
    datasets.logging.set_verbosity_error()
    # Logging is only available for the chapters that don't depend on Haystack
    if huggingface_hub.__version__ == "0.0.19":
        huggingface_hub.logging.set_verbosity_error()
    # Use O'Reilly style for plots
    set_plot_style()


def wrap_print_text(print):
    """Adapted from: https://stackoverflow.com/questions/27621655/how-to-overload-print-function-to-expand-its-functionality/27621927"""

    def wrapped_func(text):
        if not isinstance(text, str):
            text = str(text)
        wrapper = TextWrapper(
            width=80,
            break_long_words=True,
            break_on_hyphens=False,
            replace_whitespace=False,
        )
        return print("\n".join(wrapper.fill(line) for line in text.split("\n")))

    return wrapped_func


# Wrap all print() calls at 80 characters so long model outputs stay readable
print = wrap_print_text(print)
--------------------------------------------------------------------------------
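The helpers above are meant to be star-imported at the top of each chapter notebook. A sketch of a typical first cell, assuming the notebook runs from the repository root so that plotting.mplstyle and the orm_fonts/ directory are reachable:

# Typical first cell of a chapter notebook (run from the repository root)
from utils import *

setup_chapter()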