├── .gitignore
├── README.md
├── sagemaker
│   ├── 14_train_and_push_to_hub
│   │   ├── README.md
│   │   ├── imgs
│   │   │   └── emotion-widget.png
│   │   └── scripts
│   │       └── train.py
│   ├── 15_training_compiler
│   │   ├── imgs
│   │   │   └── emotion-widget.png
│   │   └── scripts
│   │       └── train.py
│   ├── 13_deploy_and_autoscaling_transformers
│   │   └── imgs
│   │       ├── sm-endpoint.png
│   │       ├── scaling-options.jpeg
│   │       ├── autoscaling-endpoint.png
│   │       ├── hf-inference-toolkit.png
│   │       └── model-monitoring-dashboard.png
│   ├── 01_getting_started_pytorch
│   │   └── scripts
│   │       └── train.py
│   ├── 06_sagemaker_metrics
│   │   └── scripts
│   │       └── train.py
│   ├── 05_spot_instances
│   │   └── scripts
│   │       └── train.py
│   ├── 02_getting_started_tensorflow
│   │   └── scripts
│   │       └── train.py
│   └── 09_image_classification_vision_transformer
│       └── scripts
│           └── train.py
├── Makefile
├── examples
│   └── images
│       ├── translation.png
│       ├── summarization.png
│       ├── model_parameters.png
│       ├── question_answering.png
│       ├── text_classification.png
│       ├── token_classification.png
│       ├── causal_language_modeling.png
│       └── masked_language_modeling.png
├── longform-qa
│   └── images
│       ├── fireworks.gif
│       ├── ELI5animation.gif
│       └── huggingface_logo.jpg
├── transformers_doc
│   ├── imgs
│   │   ├── ppl_full.gif
│   │   ├── ppl_chunked.gif
│   │   └── ppl_sliding.gif
│   └── README.md
└── course
    ├── chapter8
    │   ├── section5.ipynb
    │   └── section3.ipynb
    ├── chapter1
    │   └── section8.ipynb
    ├── videos
    │   ├── pre_tokenization.ipynb
    │   ├── rouge_metric.ipynb
    │   ├── perplexity.ipynb
    │   ├── normalization.ipynb
    │   ├── offset_mapping.ipynb
    │   ├── domain_adaptation.ipynb
    │   ├── bleu_metric.ipynb
    │   ├── datasets_and_dataframes.ipynb
    │   ├── fast_tokenizers.ipynb
    │   ├── debug_error.ipynb
    │   ├── summarization_processing.ipynb
    │   ├── clm_processing.ipynb
    │   ├── load_custom_dataset.ipynb
    │   ├── debug_training_tf.ipynb
    │   ├── save_load_dataset.ipynb
    │   ├── mlm_processing.ipynb
    │   ├── building_tokenizer.ipynb
    │   ├── memory_mapping_streaming.ipynb
    │   ├── train_new_tokenizer.ipynb
    │   ├── slice_and_dice.ipynb
    │   ├── custom_loss.ipynb
    │   ├── token_processing.ipynb
    │   ├── translation_processing.ipynb
    │   ├── sentence_pairs_tf.ipynb
    │   ├── tensorflow_finetuning.ipynb
    │   ├── semantic_search.ipynb
    │   └── token_pipeline_pt.ipynb
    ├── chapter4
    │   ├── section2_pt.ipynb
    │   └── section2_tf.ipynb
    ├── chapter2
    │   ├── section3_pt.ipynb
    │   ├── section3_tf.ipynb
    │   ├── section4_pt.ipynb
    │   ├── section4_tf.ipynb
    │   ├── section6_pt.ipynb
    │   └── section6_tf.ipynb
    ├── chapter6
    │   └── section4.ipynb
    ├── chapter5
    │   └── section2.ipynb
    └── chapter3
        ├── section3.ipynb
        └── section3_tf.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | # Jupyter Notebook
2 | .ipynb_checkpoints
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # notebooks
2 | Notebooks using the Hugging Face libraries 🤗
3 |
--------------------------------------------------------------------------------
/sagemaker/14_train_and_push_to_hub/README.md:
--------------------------------------------------------------------------------
1 | # SageMaker push to hf.co/models example
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: doc-notebooks
2 |
3 | doc-notebooks:
4 | python utils/convert_doc_to_notebooks.py
5 |
--------------------------------------------------------------------------------
/examples/images/translation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/translation.png
--------------------------------------------------------------------------------
/longform-qa/images/fireworks.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/longform-qa/images/fireworks.gif
--------------------------------------------------------------------------------
/examples/images/summarization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/summarization.png
--------------------------------------------------------------------------------
/transformers_doc/imgs/ppl_full.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/transformers_doc/imgs/ppl_full.gif
--------------------------------------------------------------------------------
/examples/images/model_parameters.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/model_parameters.png
--------------------------------------------------------------------------------
/longform-qa/images/ELI5animation.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/longform-qa/images/ELI5animation.gif
--------------------------------------------------------------------------------
/transformers_doc/imgs/ppl_chunked.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/transformers_doc/imgs/ppl_chunked.gif
--------------------------------------------------------------------------------
/transformers_doc/imgs/ppl_sliding.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/transformers_doc/imgs/ppl_sliding.gif
--------------------------------------------------------------------------------
/examples/images/question_answering.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/question_answering.png
--------------------------------------------------------------------------------
/examples/images/text_classification.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/text_classification.png
--------------------------------------------------------------------------------
/longform-qa/images/huggingface_logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/longform-qa/images/huggingface_logo.jpg
--------------------------------------------------------------------------------
/examples/images/token_classification.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/token_classification.png
--------------------------------------------------------------------------------
/examples/images/causal_language_modeling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/causal_language_modeling.png
--------------------------------------------------------------------------------
/examples/images/masked_language_modeling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/masked_language_modeling.png
--------------------------------------------------------------------------------
/sagemaker/15_training_compiler/imgs/emotion-widget.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/sagemaker/15_training_compiler/imgs/emotion-widget.png
--------------------------------------------------------------------------------
/sagemaker/14_train_and_push_to_hub/imgs/emotion-widget.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/sagemaker/14_train_and_push_to_hub/imgs/emotion-widget.png
--------------------------------------------------------------------------------
/sagemaker/13_deploy_and_autoscaling_transformers/imgs/sm-endpoint.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/sagemaker/13_deploy_and_autoscaling_transformers/imgs/sm-endpoint.png
--------------------------------------------------------------------------------
/sagemaker/13_deploy_and_autoscaling_transformers/imgs/scaling-options.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/sagemaker/13_deploy_and_autoscaling_transformers/imgs/scaling-options.jpeg
--------------------------------------------------------------------------------
/sagemaker/13_deploy_and_autoscaling_transformers/imgs/autoscaling-endpoint.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/sagemaker/13_deploy_and_autoscaling_transformers/imgs/autoscaling-endpoint.png
--------------------------------------------------------------------------------
/sagemaker/13_deploy_and_autoscaling_transformers/imgs/hf-inference-toolkit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/sagemaker/13_deploy_and_autoscaling_transformers/imgs/hf-inference-toolkit.png
--------------------------------------------------------------------------------
/sagemaker/13_deploy_and_autoscaling_transformers/imgs/model-monitoring-dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/sagemaker/13_deploy_and_autoscaling_transformers/imgs/model-monitoring-dashboard.png
--------------------------------------------------------------------------------
/transformers_doc/README.md:
--------------------------------------------------------------------------------
1 | # 🤗 Transformers doc notebooks
2 |
3 | These notebooks are automatically generated from the [🤗 Transformers documentation](https://huggingface.co/transformers/)
4 | so you should not make any direct modification here. If there is a typo to fix or a sentence to add, open a pull
5 | request in the [🤗 Transformers repo](https://github.com/huggingface/transformers) and fix the corresponding file in
6 | the `docs/source/` folder.
7 |
8 | If there is something that seems weirdly converted from the original doc file, open an issue in this repo and we will
9 | try to fix the conversion script.
--------------------------------------------------------------------------------
/course/chapter8/section5.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# How to write a good issue"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": []
32 | }
33 | ],
34 | "metadata": {
35 | "colab": {
36 | "name": "How to write a good issue",
37 | "provenance": []
38 | }
39 | },
40 | "nbformat": 4,
41 | "nbformat_minor": 4
42 | }
43 |
--------------------------------------------------------------------------------
/course/chapter1/section8.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Bias and limitations"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "data": {
33 | "text/plain": [
34 | "['lawyer', 'carpenter', 'doctor', 'waiter', 'mechanic']\n",
35 | "['nurse', 'waitress', 'teacher', 'maid', 'prostitute']"
36 | ]
37 | },
38 | "execution_count": null,
39 | "metadata": {},
40 | "output_type": "execute_result"
41 | }
42 | ],
43 | "source": [
44 | "from transformers import pipeline\n",
45 | "\n",
46 | "unmasker = pipeline(\"fill-mask\", model=\"bert-base-uncased\")\n",
47 | "result = unmasker(\"This man works as a [MASK].\")\n",
48 | "print([r[\"token_str\"] for r in result])\n",
49 | "\n",
50 | "result = unmasker(\"This woman works as a [MASK].\")\n",
51 | "print([r[\"token_str\"] for r in result])"
52 | ]
53 | }
54 | ],
55 | "metadata": {
56 | "colab": {
57 | "name": "Bias and limitations",
58 | "provenance": []
59 | }
60 | },
61 | "nbformat": 4,
62 | "nbformat_minor": 4
63 | }
64 |
--------------------------------------------------------------------------------
/course/videos/pre_tokenization.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from transformers import AutoTokenizerFast\n",
61 | "\n",
62 | "tokenizer = AutoTokenizerFast.from_pretrained('albert-base-v1’)\n",
63 | "\n",
64 | "text = \"3.2.1: let's get started!\"\n",
65 | "\n",
66 | "print(tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text))"
67 | ]
68 | }
69 | ],
70 | "metadata": {
71 | "colab": {
72 | "name": "What is pre-tokenization?",
73 | "provenance": []
74 | }
75 | },
76 | "nbformat": 4,
77 | "nbformat_minor": 4
78 | }
79 |
--------------------------------------------------------------------------------
/course/videos/rouge_metric.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "! pip install nltk rouge_score"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "from datasets import load_metric\n",
70 | "\n",
71 | "rouge = load_metric(\"rouge\")\n",
72 | "predictions = [\"I really loved reading the Hunger Games\"]\n",
73 | "references = [\"I loved reading the Hunger Games\"]\n",
74 | "rouge.compute(predictions=predictions, references=references)"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": []
83 | }
84 | ],
85 | "metadata": {
86 | "colab": {
87 | "name": "What is the ROUGE metric?",
88 | "provenance": []
89 | }
90 | },
91 | "nbformat": 4,
92 | "nbformat_minor": 4
93 | }
94 |
--------------------------------------------------------------------------------
/course/videos/perplexity.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
61 | "import torch\n",
62 | "\n",
63 | "model = AutoModelForCausalLM.from_pretrained(\"gpt2\")\n",
64 | "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
65 | "\n",
66 | "inputs = tokenizer(\"Hugging Face is a startup based in New York City and Paris\",\n",
67 | " return_tensors=\"pt\")\n",
68 | "\n",
69 | "loss = model(input_ids=inputs[\"input_ids\"],\n",
70 | " labels=inputs[\"input_ids\"]).loss\n",
71 | "\n",
72 | "ppl = torch.exp(loss)\n",
73 | "\n",
74 | "print(f\"Perplexity: {ppl.item():.2f}\")"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": []
83 | }
84 | ],
85 | "metadata": {
86 | "colab": {
87 | "name": "What is perplexity?",
88 | "provenance": []
89 | }
90 | },
91 | "nbformat": 4,
92 | "nbformat_minor": 4
93 | }
94 |
--------------------------------------------------------------------------------
/course/videos/normalization.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from transformers import AutoTokenizer\n",
61 | "\n",
62 | "text = \"This is a text with àccënts and CAPITAL LETTERS\"\n",
63 | "\n",
64 | "tokenizer = AutoTokenizer.from_pretrained(\"albert-large-v2\")\n",
65 | "print(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))\n",
66 | "\n",
67 | "tokenizer = AutoTokenizer.from_pretrained(\"huggingface-course/albert-tokenizer-without-normalizer\")\n",
68 | "print(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "text = \"un père indigné\"\n",
78 | "\n",
79 | "tokenizer = AutoTokenizerFast.from_pretrained('distilbert-base-uncased')\n",
80 | "print(tokenizer.backend_tokenizer.normalizer.normalize_str(text))"
81 | ]
82 | }
83 | ],
84 | "metadata": {
85 | "colab": {
86 | "name": "What is normalization?",
87 | "provenance": []
88 | }
89 | },
90 | "nbformat": 4,
91 | "nbformat_minor": 4
92 | }
93 |
--------------------------------------------------------------------------------
/course/chapter4/section2_pt.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Using pretrained models (PyTorch)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "data": {
33 | "text/plain": [
34 | "[\n",
35 | " {'sequence': 'Le camembert est délicieux :)', 'score': 0.49091005325317383, 'token': 7200, 'token_str': 'délicieux'}, \n",
36 | " {'sequence': 'Le camembert est excellent :)', 'score': 0.1055697426199913, 'token': 2183, 'token_str': 'excellent'}, \n",
37 | " {'sequence': 'Le camembert est succulent :)', 'score': 0.03453313186764717, 'token': 26202, 'token_str': 'succulent'}, \n",
38 | " {'sequence': 'Le camembert est meilleur :)', 'score': 0.0330314114689827, 'token': 528, 'token_str': 'meilleur'}, \n",
39 | " {'sequence': 'Le camembert est parfait :)', 'score': 0.03007650189101696, 'token': 1654, 'token_str': 'parfait'}\n",
40 | "]"
41 | ]
42 | },
43 | "execution_count": null,
44 | "metadata": {},
45 | "output_type": "execute_result"
46 | }
47 | ],
48 | "source": [
49 | "from transformers import pipeline\n",
50 | "\n",
51 | "camembert_fill_mask = pipeline(\"fill-mask\", model=\"camembert-base\")\n",
52 | "results = camembert_fill_mask(\"Le camembert est :)\")"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "from transformers import CamembertTokenizer, CamembertForMaskedLM\n",
62 | "\n",
63 | "tokenizer = CamembertTokenizer.from_pretrained(\"camembert-base\")\n",
64 | "model = CamembertForMaskedLM.from_pretrained(\"camembert-base\")"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "from transformers import AutoTokenizer, AutoModelForMaskedLM\n",
74 | "\n",
75 | "tokenizer = AutoTokenizer.from_pretrained(\"camembert-base\")\n",
76 | "model = AutoModelForMaskedLM.from_pretrained(\"camembert-base\")"
77 | ]
78 | }
79 | ],
80 | "metadata": {
81 | "colab": {
82 | "name": "Using pretrained models (PyTorch)",
83 | "provenance": []
84 | }
85 | },
86 | "nbformat": 4,
87 | "nbformat_minor": 4
88 | }
89 |
--------------------------------------------------------------------------------
/course/chapter4/section2_tf.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Using pretrained models (TensorFlow)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "data": {
33 | "text/plain": [
34 | "[\n",
35 | " {'sequence': 'Le camembert est délicieux :)', 'score': 0.49091005325317383, 'token': 7200, 'token_str': 'délicieux'}, \n",
36 | " {'sequence': 'Le camembert est excellent :)', 'score': 0.1055697426199913, 'token': 2183, 'token_str': 'excellent'}, \n",
37 | " {'sequence': 'Le camembert est succulent :)', 'score': 0.03453313186764717, 'token': 26202, 'token_str': 'succulent'}, \n",
38 | " {'sequence': 'Le camembert est meilleur :)', 'score': 0.0330314114689827, 'token': 528, 'token_str': 'meilleur'}, \n",
39 | " {'sequence': 'Le camembert est parfait :)', 'score': 0.03007650189101696, 'token': 1654, 'token_str': 'parfait'}\n",
40 | "]"
41 | ]
42 | },
43 | "execution_count": null,
44 | "metadata": {},
45 | "output_type": "execute_result"
46 | }
47 | ],
48 | "source": [
49 | "from transformers import pipeline\n",
50 | "\n",
51 | "camembert_fill_mask = pipeline(\"fill-mask\", model=\"camembert-base\")\n",
52 | "results = camembert_fill_mask(\"Le camembert est :)\")"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "from transformers import CamembertTokenizer, TFCamembertForMaskedLM\n",
62 | "\n",
63 | "tokenizer = CamembertTokenizer.from_pretrained(\"camembert-base\")\n",
64 | "model = TFCamembertForMaskedLM.from_pretrained(\"camembert-base\")"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "from transformers import AutoTokenizer, TFAutoModelForMaskedLM\n",
74 | "\n",
75 | "tokenizer = AutoTokenizer.from_pretrained(\"camembert-base\")\n",
76 | "model = TFAutoModelForMaskedLM.from_pretrained(\"camembert-base\")"
77 | ]
78 | }
79 | ],
80 | "metadata": {
81 | "colab": {
82 | "name": "Using pretrained models (TensorFlow)",
83 | "provenance": []
84 | }
85 | },
86 | "nbformat": 4,
87 | "nbformat_minor": 4
88 | }
89 |
--------------------------------------------------------------------------------
/course/videos/offset_mapping.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from transformers import AutoTokenizer\n",
61 | "\n",
62 | "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n",
63 | "print(tokenizer(\"Let's talk about tokenizers superpowers.\")[\"input_ids\"])\n",
64 | "print(tokenizer(\"Let's talk about tokenizers superpowers.\")[\"input_ids\"])"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "encoding = tokenizer(\"Let's talk about tokenizers superpowers.\")\n",
74 | "print(encoding.tokens())\n",
75 | "print(encoding.word_ids())"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "encoding = tokenizer(\n",
85 | " \"Let's talk about tokenizers superpowers.\",\n",
86 | " return_offsets_mapping=True\n",
87 | ")\n",
88 | "print(encoding.tokens())\n",
89 | "print(encoding[\"offset_mapping\"])"
90 | ]
91 | }
92 | ],
93 | "metadata": {
94 | "colab": {
95 | "name": "Fast tokenizer superpowers",
96 | "provenance": []
97 | }
98 | },
99 | "nbformat": 4,
100 | "nbformat_minor": 4
101 | }
102 |
--------------------------------------------------------------------------------
/course/videos/domain_adaptation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from transformers import pipeline"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "model_checkpoint = \"distilbert-base-uncased\"\n",
70 | "fill_masker = pipeline(\"fill-mask\", model=model_checkpoint)\n",
71 | "fill_masker(\"This is a great [MASK].\")"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "model_checkpoint = \"huggingface-course/distilbert-base-uncased-finetuned-imdb\"\n",
81 | "fill_masker = pipeline(\"fill-mask\", model=model_checkpoint)\n",
82 | "fill_masker(\"This is a great [MASK].\")"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "model_checkpoint = \"Helsinki-NLP/opus-mt-en-fr\"\n",
92 | "translator = pipeline(\"translation\", model=model_checkpoint)\n",
93 | "translator(\"This plugin automatically translates emails.\")"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "model_checkpoint = \"huggingface-course/marian-finetuned-kde4-en-to-fr\")\n",
103 | "translator = pipeline(\"translation\", model=model_checkpoint)\n",
104 | "translator(\"This plugin automatically translates emails.\")"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": []
113 | }
114 | ],
115 | "metadata": {
116 | "colab": {
117 | "name": "What is domain adaptation?",
118 | "provenance": []
119 | }
120 | },
121 | "nbformat": 4,
122 | "nbformat_minor": 4
123 | }
124 |
--------------------------------------------------------------------------------
/course/chapter2/section3_pt.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Models (PyTorch)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "from transformers import BertConfig, BertModel\n",
33 | "\n",
34 | "# Building the config\n",
35 | "config = BertConfig()\n",
36 | "\n",
37 | "# Building the model from the config\n",
38 | "model = BertModel(config)"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {},
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/plain": [
49 | "BertConfig {\n",
50 | " [...]\n",
51 | " \"hidden_size\": 768,\n",
52 | " \"intermediate_size\": 3072,\n",
53 | " \"max_position_embeddings\": 512,\n",
54 | " \"num_attention_heads\": 12,\n",
55 | " \"num_hidden_layers\": 12,\n",
56 | " [...]\n",
57 | "}"
58 | ]
59 | },
60 | "execution_count": null,
61 | "metadata": {},
62 | "output_type": "execute_result"
63 | }
64 | ],
65 | "source": [
66 | "print(config)"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "from transformers import BertConfig, BertModel\n",
76 | "\n",
77 | "config = BertConfig()\n",
78 | "model = BertModel(config)\n",
79 | "\n",
80 | "# Model is randomly initialized!"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "from transformers import BertModel\n",
90 | "\n",
91 | "model = BertModel.from_pretrained(\"bert-base-cased\")"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "model.save_pretrained(\"directory_on_my_computer\")"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "sequences = [\"Hello!\", \"Cool.\", \"Nice!\"]"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "encoded_sequences = [\n",
119 | " [101, 7592, 999, 102],\n",
120 | " [101, 4658, 1012, 102],\n",
121 | " [101, 3835, 999, 102],\n",
122 | "]"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "import torch\n",
132 | "\n",
133 | "model_inputs = torch.tensor(encoded_sequences)"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "output = model(model_inputs)"
143 | ]
144 | }
145 | ],
146 | "metadata": {
147 | "colab": {
148 | "name": "Models (PyTorch)",
149 | "provenance": []
150 | }
151 | },
152 | "nbformat": 4,
153 | "nbformat_minor": 4
154 | }
155 |
--------------------------------------------------------------------------------
/course/chapter6/section4.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Normalization and pre-tokenization"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "data": {
33 | "text/plain": [
34 | ""
35 | ]
36 | },
37 | "execution_count": null,
38 | "metadata": {},
39 | "output_type": "execute_result"
40 | }
41 | ],
42 | "source": [
43 | "from transformers import AutoTokenizer\n",
44 | "\n",
45 | "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
46 | "print(type(tokenizer.backend_tokenizer))"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {},
53 | "outputs": [
54 | {
55 | "data": {
56 | "text/plain": [
57 | "'hello how are u?'"
58 | ]
59 | },
60 | "execution_count": null,
61 | "metadata": {},
62 | "output_type": "execute_result"
63 | }
64 | ],
65 | "source": [
66 | "print(tokenizer.backend_tokenizer.normalizer.normalize_str(\"Héllò hôw are ü?\"))"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [
74 | {
75 | "data": {
76 | "text/plain": [
77 | "[('Hello', (0, 5)), (',', (5, 6)), ('how', (7, 10)), ('are', (11, 14)), ('you', (16, 19)), ('?', (19, 20))]"
78 | ]
79 | },
80 | "execution_count": null,
81 | "metadata": {},
82 | "output_type": "execute_result"
83 | }
84 | ],
85 | "source": [
86 | "tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(\"Hello, how are you?\")"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {},
93 | "outputs": [
94 | {
95 | "data": {
96 | "text/plain": [
97 | "[('Hello', (0, 5)), (',', (5, 6)), ('Ġhow', (6, 10)), ('Ġare', (10, 14)), ('Ġ', (14, 15)), ('Ġyou', (15, 19)),\n",
98 | " ('?', (19, 20))]"
99 | ]
100 | },
101 | "execution_count": null,
102 | "metadata": {},
103 | "output_type": "execute_result"
104 | }
105 | ],
106 | "source": [
107 | "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
108 | "tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(\"Hello, how are you?\")"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {},
115 | "outputs": [
116 | {
117 | "data": {
118 | "text/plain": [
119 | "[('▁Hello,', (0, 6)), ('▁how', (7, 10)), ('▁are', (11, 14)), ('▁you?', (16, 20))]"
120 | ]
121 | },
122 | "execution_count": null,
123 | "metadata": {},
124 | "output_type": "execute_result"
125 | }
126 | ],
127 | "source": [
128 | "tokenizer = AutoTokenizer.from_pretrained(\"t5-small\")\n",
129 | "tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(\"Hello, how are you?\")"
130 | ]
131 | }
132 | ],
133 | "metadata": {
134 | "colab": {
135 | "name": "Normalization and pre-tokenization",
136 | "provenance": []
137 | }
138 | },
139 | "nbformat": 4,
140 | "nbformat_minor": 4
141 | }
142 |
--------------------------------------------------------------------------------
/course/chapter2/section3_tf.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Models (TensorFlow)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "from transformers import BertConfig, TFBertModel\n",
33 | "\n",
34 | "# Building the config\n",
35 | "config = BertConfig()\n",
36 | "\n",
37 | "# Building the model from the config\n",
38 | "model = TFBertModel(config)"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {},
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/plain": [
49 | "BertConfig {\n",
50 | " [...]\n",
51 | " \"hidden_size\": 768,\n",
52 | " \"intermediate_size\": 3072,\n",
53 | " \"max_position_embeddings\": 512,\n",
54 | " \"num_attention_heads\": 12,\n",
55 | " \"num_hidden_layers\": 12,\n",
56 | " [...]\n",
57 | "}"
58 | ]
59 | },
60 | "execution_count": null,
61 | "metadata": {},
62 | "output_type": "execute_result"
63 | }
64 | ],
65 | "source": [
66 | "print(config)"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "from transformers import BertConfig, TFBertModel\n",
76 | "\n",
77 | "config = BertConfig()\n",
78 | "model = TFBertModel(config)\n",
79 | "\n",
80 | "# Model is randomly initialized!"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "from transformers import TFBertModel\n",
90 | "\n",
91 | "model = TFBertModel.from_pretrained(\"bert-base-cased\")"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "model.save_pretrained(\"directory_on_my_computer\")"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "sequences = [\"Hello!\", \"Cool.\", \"Nice!\"]"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "encoded_sequences = [\n",
119 | " [101, 7592, 999, 102],\n",
120 | " [101, 4658, 1012, 102],\n",
121 | " [101, 3835, 999, 102],\n",
122 | "]"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "import tensorflow as tf\n",
132 | "\n",
133 | "model_inputs = tf.constant(encoded_sequences)"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "output = model(model_inputs)"
143 | ]
144 | }
145 | ],
146 | "metadata": {
147 | "colab": {
148 | "name": "Models (TensorFlow)",
149 | "provenance": []
150 | }
151 | },
152 | "nbformat": 4,
153 | "nbformat_minor": 4
154 | }
155 |
--------------------------------------------------------------------------------
/sagemaker/01_getting_started_pytorch/scripts/train.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
2 | from sklearn.metrics import accuracy_score, precision_recall_fscore_support
3 | from datasets import load_from_disk
4 | import random
5 | import logging
6 | import sys
7 | import argparse
8 | import os
9 | import torch
10 |
11 | if __name__ == "__main__":
12 |
13 | parser = argparse.ArgumentParser()
14 |
15 | # hyperparameters sent by the client are passed as command-line arguments to the script.
16 | parser.add_argument("--epochs", type=int, default=3)
17 | parser.add_argument("--train_batch_size", type=int, default=32)
18 | parser.add_argument("--eval_batch_size", type=int, default=64)
19 | parser.add_argument("--warmup_steps", type=int, default=500)
20 | parser.add_argument("--model_name", type=str)
21 | parser.add_argument("--learning_rate", type=str, default=5e-5)
22 |
23 | # Data, model, and output directories
24 | parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
25 | parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
26 | parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
27 | parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
28 | parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
29 |
30 | args, _ = parser.parse_known_args()
31 |
32 | # Set up logging
33 | logger = logging.getLogger(__name__)
34 |
35 | logging.basicConfig(
36 | level=logging.getLevelName("INFO"),
37 | handlers=[logging.StreamHandler(sys.stdout)],
38 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
39 | )
40 |
41 | # load datasets
42 | train_dataset = load_from_disk(args.training_dir)
43 | test_dataset = load_from_disk(args.test_dir)
44 |
45 | logger.info(f" loaded train_dataset length is: {len(train_dataset)}")
46 | logger.info(f" loaded test_dataset length is: {len(test_dataset)}")
47 |
48 | # compute metrics function for binary classification
49 | def compute_metrics(pred):
50 | labels = pred.label_ids
51 | preds = pred.predictions.argmax(-1)
52 | precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
53 | acc = accuracy_score(labels, preds)
54 | return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}
55 |
56 | # download model from model hub
57 | model = AutoModelForSequenceClassification.from_pretrained(args.model_name)
58 | tokenizer = AutoTokenizer.from_pretrained(args.model_name)
59 |
60 | # define training args
61 | training_args = TrainingArguments(
62 | output_dir=args.model_dir,
63 | num_train_epochs=args.epochs,
64 | per_device_train_batch_size=args.train_batch_size,
65 | per_device_eval_batch_size=args.eval_batch_size,
66 | warmup_steps=args.warmup_steps,
67 | evaluation_strategy="epoch",
68 | logging_dir=f"{args.output_data_dir}/logs",
69 | learning_rate=float(args.learning_rate),
70 | )
71 |
72 | # create Trainer instance
73 | trainer = Trainer(
74 | model=model,
75 | args=training_args,
76 | compute_metrics=compute_metrics,
77 | train_dataset=train_dataset,
78 | eval_dataset=test_dataset,
79 | tokenizer=tokenizer,
80 | )
81 |
82 | # train model
83 | trainer.train()
84 |
85 | # evaluate model
86 | eval_result = trainer.evaluate(eval_dataset=test_dataset)
87 |
88 | # writes eval result to file which can be accessed later in s3 output
89 | with open(os.path.join(args.output_data_dir, "eval_results.txt"), "w") as writer:
90 | print(f"***** Eval results *****")
91 | for key, value in sorted(eval_result.items()):
92 | writer.write(f"{key} = {value}\n")
93 |
94 | # Saves the model to s3
95 | trainer.save_model(args.model_dir)
96 |
--------------------------------------------------------------------------------
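Note on usage: the script above follows the SageMaker training contract — hyperparameters arrive as command-line arguments, and the `SM_CHANNEL_TRAIN` / `SM_CHANNEL_TEST` environment variables point at the downloaded input channels. A minimal launch sketch with the SageMaker Python SDK's `HuggingFace` estimator is shown below; the role handling, instance type, container versions, and S3 URIs are placeholder assumptions, not values taken from this repository.

    # Hypothetical launcher for scripts/train.py (not a file in this repo).
    import sagemaker
    from sagemaker.huggingface import HuggingFace

    role = sagemaker.get_execution_role()  # assumes a SageMaker notebook/Studio environment

    # These keys map onto the argparse arguments defined in train.py.
    hyperparameters = {
        "epochs": 1,
        "train_batch_size": 32,
        "model_name": "distilbert-base-uncased",
    }

    huggingface_estimator = HuggingFace(
        entry_point="train.py",
        source_dir="./scripts",
        instance_type="ml.p3.2xlarge",   # placeholder instance type
        instance_count=1,
        role=role,
        transformers_version="4.6",      # placeholder container versions
        pytorch_version="1.7",
        py_version="py36",
        hyperparameters=hyperparameters,
    )

    # The "train" and "test" channels populate SM_CHANNEL_TRAIN / SM_CHANNEL_TEST,
    # which train.py reads via --training_dir and --test_dir.
    huggingface_estimator.fit({
        "train": "s3://my-bucket/train",  # placeholder S3 URIs
        "test": "s3://my-bucket/test",
    })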
/sagemaker/06_sagemaker_metrics/scripts/train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import os
4 | import random
5 | import sys
6 |
7 | from datasets import load_from_disk
8 | from sklearn.metrics import accuracy_score, precision_recall_fscore_support
9 | import torch
10 | from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
11 |
12 |
13 | if __name__ == "__main__":
14 |
15 | parser = argparse.ArgumentParser()
16 |
17 | # hyperparameters sent by the client are passed as command-line arguments to the script.
18 | parser.add_argument("--epochs", type=int, default=3)
19 | parser.add_argument("--train_batch_size", type=int, default=32)
20 | parser.add_argument("--eval_batch_size", type=int, default=64)
21 | parser.add_argument("--warmup_steps", type=int, default=500)
22 | parser.add_argument("--model_name", type=str)
23 | parser.add_argument("--learning_rate", type=float, default=5e-5)
24 |
25 | # Data, model, and output directories
26 | parser.add_argument("--checkpoints", type=str, default="/opt/ml/checkpoints/")
27 | parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
28 | parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
29 | parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
30 | parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
31 |
32 | args, _ = parser.parse_known_args()
33 |
34 | # Set up logging
35 | logger = logging.getLogger(__name__)
36 |
37 | logging.basicConfig(
38 | level=logging.getLevelName("INFO"),
39 | handlers=[logging.StreamHandler(sys.stdout)],
40 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
41 | )
42 |
43 | # load datasets
44 | train_dataset = load_from_disk(args.training_dir)
45 | test_dataset = load_from_disk(args.test_dir)
46 |
47 | logger.info(f" loaded train_dataset length is: {len(train_dataset)}")
48 | logger.info(f" loaded test_dataset length is: {len(test_dataset)}")
49 |
50 | # compute metrics function for binary classification
51 | def compute_metrics(pred):
52 | labels = pred.label_ids
53 | preds = pred.predictions.argmax(-1)
54 | precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
55 | acc = accuracy_score(labels, preds)
56 | return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}
57 |
58 | # download model from model hub
59 | model = AutoModelForSequenceClassification.from_pretrained(args.model_name)
60 | tokenizer = AutoTokenizer.from_pretrained(args.model_name)
61 |
62 | # define training args
63 | training_args = TrainingArguments(
64 | output_dir=args.checkpoints,
65 | num_train_epochs=args.epochs,
66 | per_device_train_batch_size=args.train_batch_size,
67 | per_device_eval_batch_size=args.eval_batch_size,
68 | warmup_steps=args.warmup_steps,
69 | evaluation_strategy="epoch",
70 | logging_dir=f"{args.checkpoints}/logs",
71 | learning_rate=args.learning_rate,
72 | )
73 |
74 | # create Trainer instance
75 | trainer = Trainer(
76 | model=model,
77 | args=training_args,
78 | compute_metrics=compute_metrics,
79 | train_dataset=train_dataset,
80 | eval_dataset=test_dataset,
81 | tokenizer=tokenizer,
82 | )
83 |
84 | # train model
85 | trainer.train()
86 |
87 | # evaluate model
88 | eval_result = trainer.evaluate(eval_dataset=test_dataset)
89 |
90 | # writes eval result to file which can be accessed later in s3 output
91 | with open(os.path.join(args.checkpoints, "eval_results.txt"), "w") as writer:
92 | print(f"***** Eval results *****")
93 | for key, value in sorted(eval_result.items()):
94 | writer.write(f"{key} = {value}\n")
95 |
96 | # Saves the model locally. In SageMaker, writing in /opt/ml/model sends it to S3
97 | trainer.save_model(args.model_dir)
98 |
--------------------------------------------------------------------------------
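Note on usage: the point of the 06_sagemaker_metrics example is that the Trainer's evaluation logs can be surfaced as SageMaker training-job metrics. That wiring lives in the estimator, not in train.py: `metric_definitions` regexes are matched against the job's log stream. A minimal sketch follows; the regexes, container versions, role ARN, and instance type are illustrative assumptions and must be adapted to the actual log format.

    # Hypothetical estimator configuration for scripts/train.py (not a file in this repo).
    from sagemaker.huggingface import HuggingFace

    # Each regex captures one number from the Trainer's evaluation log lines.
    metric_definitions = [
        {"Name": "eval_loss", "Regex": "'eval_loss': ([0-9\\.]+)"},
        {"Name": "eval_accuracy", "Regex": "'eval_accuracy': ([0-9\\.]+)"},
        {"Name": "eval_f1", "Regex": "'eval_f1': ([0-9\\.]+)"},
    ]

    huggingface_estimator = HuggingFace(
        entry_point="train.py",
        source_dir="./scripts",
        instance_type="ml.p3.2xlarge",                          # placeholder
        instance_count=1,
        role="arn:aws:iam::111122223333:role/SageMakerRole",    # placeholder role ARN
        transformers_version="4.6",                             # placeholder container versions
        pytorch_version="1.7",
        py_version="py36",
        metric_definitions=metric_definitions,                  # surfaced via CloudWatch from the job logs
        hyperparameters={"epochs": 1, "model_name": "distilbert-base-uncased"},
    )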
/course/videos/bleu_metric.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_metric\n",
61 | "\n",
62 | "bleu = load_metric(\"bleu\")\n",
63 | "predictions = [[\"I\", \"have\", \"thirty\", \"six\", \"years\"]]\n",
64 | "references = [\n",
65 | " [[\"I\", \"am\", \"thirty\", \"six\", \"years\", \"old\"], [\"I\", \"am\", \"thirty\", \"six\"]]\n",
66 | "]\n",
67 | "bleu.compute(predictions=predictions, references=references)"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "predictions = [[\"I\", \"have\", \"thirty\", \"six\", \"years\"]]\n",
77 | "references = [\n",
78 | " [[\"I\", \"am\", \"thirty\", \"six\", \"years\", \"old\"], [\"I\", \"am\", \"thirty\", \"six\"]]\n",
79 | "]\n",
80 | "bleu.compute(predictions=predictions, references=references)"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "predictions = [[\"I\", \"have\", \"thirty\", \"six\", \"years\"]]\n",
90 | "references = [\n",
91 | " [[\"I\", \"am\", \"thirty\", \"six\", \"years\", \"old\"], [\"I\", \"am\", \"thirty\", \"six\"]]\n",
92 | "]\n",
93 | "bleu.compute(predictions=predictions, references=references)"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "! pip install sacrebleu"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "sacrebleu = load_metric(\"sacrebleu\")\n",
112 | "# SacreBLEU operates on raw text, not tokens\n",
113 | "predictions = [\"I have thirty six years\"]\n",
114 | "references = [[\"I am thirty six years old\", \"I am thirty six\"]]\n",
115 | "sacrebleu.compute(predictions=predictions, references=references)"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": []
124 | }
125 | ],
126 | "metadata": {
127 | "colab": {
128 | "name": "What is the BLEU metric?",
129 | "provenance": []
130 | }
131 | },
132 | "nbformat": 4,
133 | "nbformat_minor": 4
134 | }
135 |
--------------------------------------------------------------------------------
/course/videos/datasets_and_dataframes.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_dataset\n",
61 | "\n",
62 | "dataset = load_dataset(\"swiss_judgment_prediction\", \"all_languages\", split=\"train\")\n",
63 | "dataset[0]"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "# Convert the output format to pandas.DataFrame\n",
73 | "dataset.set_format(\"pandas\")\n",
74 | "dataset[0]"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "dataset.__getitem__(0)\n",
84 | "\n",
85 | "dataset.set_format(\"pandas\")\n",
86 | "\n",
87 | "dataset.__getitem__(0)"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "df = dataset.to_pandas()\n",
97 | "df.head()"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "# How are languages distributed across regions?\n",
107 | "df.groupby(\"region\")[\"language\"].value_counts()\n",
108 | "\n",
109 | "# Which legal area is most common?\n",
110 | "df[\"legal area\"].value_counts()"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "from transformers import AutoTokenizer\n",
120 | "\n",
121 | "# Load a pretrained tokenizer\n",
122 | "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
123 | "# Tokenize the `text` column\n",
124 | "dataset.map(lambda x : tokenizer(x[\"text\"]))"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "# Reset back to Arrow format\n",
134 | "dataset.reset_format()\n",
135 | "# Now we can tokenize!\n",
136 | "dataset.map(lambda x : tokenizer(x[\"text\"]))"
137 | ]
138 | }
139 | ],
140 | "metadata": {
141 | "colab": {
142 | "name": "Datasets + DataFrames = ❤️",
143 | "provenance": []
144 | }
145 | },
146 | "nbformat": 4,
147 | "nbformat_minor": 4
148 | }
149 |
--------------------------------------------------------------------------------
/course/videos/fast_tokenizers.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_dataset\n",
61 | "\n",
62 | "raw_datasets = load_dataset(\"glue\", \"mnli\")\n",
63 | "raw_datasets"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "from transformers import AutoTokenizer\n",
73 | "\n",
74 | "fast_tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n",
75 | "\n",
76 | "def tokenize_with_fast(examples):\n",
77 | " return fast_tokenizer(\n",
78 | " examples[\"premise\"], examples[\"hypothesis\"], truncation=True\n",
79 | " )"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "slow_tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\", use_fast=False)\n",
89 | "\n",
90 | "def tokenize_with_slow(examples):\n",
91 |     " return slow_tokenizer(\n",
92 | " examples[\"premise\"], examples[\"hypothesis\"], truncation=True\n",
93 | " )"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "%time tokenized_datasets = raw_datasets.map(tokenize_with_fast)"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "%time tokenized_datasets = raw_datasets.map(tokenize_with_slow)"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "%time tokenized_datasets = raw_datasets.map(tokenize_with_fast, batched=True)"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "%time tokenized_datasets = raw_datasets.map(tokenize_with_slow, batched=True)"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": []
138 | }
139 | ],
140 | "metadata": {
141 | "colab": {
142 | "name": "Why are fast tokenizers called fast?",
143 | "provenance": []
144 | }
145 | },
146 | "nbformat": 4,
147 | "nbformat_minor": 4
148 | }
149 |
--------------------------------------------------------------------------------
/sagemaker/05_spot_instances/scripts/train.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
2 | from transformers.trainer_utils import get_last_checkpoint
3 |
4 | from sklearn.metrics import accuracy_score, precision_recall_fscore_support
5 | from datasets import load_from_disk
6 | import logging
7 | import sys
8 | import argparse
9 | import os
10 |
11 | # Set up logging
12 | logger = logging.getLogger(__name__)
13 |
14 | logging.basicConfig(
15 | level=logging.getLevelName("INFO"),
16 | handlers=[logging.StreamHandler(sys.stdout)],
17 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
18 | )
19 |
20 | if __name__ == "__main__":
21 |
22 | logger.info(sys.argv)
23 |
24 | parser = argparse.ArgumentParser()
25 |
26 | # hyperparameters sent by the client are passed as command-line arguments to the script.
27 | parser.add_argument("--epochs", type=int, default=3)
28 | parser.add_argument("--train_batch_size", type=int, default=32)
29 | parser.add_argument("--eval_batch_size", type=int, default=64)
30 | parser.add_argument("--warmup_steps", type=int, default=500)
31 | parser.add_argument("--model_name", type=str)
32 | parser.add_argument("--learning_rate", type=str, default=5e-5)
33 | parser.add_argument("--output_dir", type=str)
34 |
35 | # Data, model, and output directories
36 | parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
37 | parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
38 | parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
39 | parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
40 | parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
41 |
42 | args, _ = parser.parse_known_args()
43 |
44 | # load datasets
45 | train_dataset = load_from_disk(args.training_dir)
46 | test_dataset = load_from_disk(args.test_dir)
47 |
48 | logger.info(f" loaded train_dataset length is: {len(train_dataset)}")
49 | logger.info(f" loaded test_dataset length is: {len(test_dataset)}")
50 |
51 | # compute metrics function for binary classification
52 | def compute_metrics(pred):
53 | labels = pred.label_ids
54 | preds = pred.predictions.argmax(-1)
55 | precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
56 | acc = accuracy_score(labels, preds)
57 | return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}
58 |
59 | # download model from model hub
60 | model = AutoModelForSequenceClassification.from_pretrained(args.model_name)
61 | tokenizer = AutoTokenizer.from_pretrained(args.model_name)
62 |
63 | # define training args
64 | training_args = TrainingArguments(
65 | output_dir=args.output_dir,
66 | num_train_epochs=args.epochs,
67 | per_device_train_batch_size=args.train_batch_size,
68 | per_device_eval_batch_size=args.eval_batch_size,
69 | warmup_steps=args.warmup_steps,
70 | evaluation_strategy="epoch",
71 | logging_dir=f"{args.output_data_dir}/logs",
72 | learning_rate=float(args.learning_rate),
73 | )
74 |
75 | # create Trainer instance
76 | trainer = Trainer(
77 | model=model,
78 | args=training_args,
79 | compute_metrics=compute_metrics,
80 | train_dataset=train_dataset,
81 | eval_dataset=test_dataset,
82 | tokenizer=tokenizer,
83 | )
84 |
85 |     # train model (resume from the most recent checkpoint if one exists, e.g. after a spot interruption)
86 | if get_last_checkpoint(args.output_dir) is not None:
87 | logger.info("***** continue training *****")
88 | last_checkpoint = get_last_checkpoint(args.output_dir)
89 | trainer.train(resume_from_checkpoint=last_checkpoint)
90 | else:
91 | trainer.train()
92 | # evaluate model
93 | eval_result = trainer.evaluate(eval_dataset=test_dataset)
94 |
95 |     # write eval results to a file that can be accessed later from the S3 output
96 |     with open(os.path.join(args.output_data_dir, "eval_results.txt"), "w") as writer:
97 |         print("***** Eval results *****")
98 | for key, value in sorted(eval_result.items()):
99 | writer.write(f"{key} = {value}\n")
100 |
101 | # Saves the model to s3
102 | trainer.save_model(args.model_dir)
103 |
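104 | # Illustrative launch sketch (kept as comments so nothing extra runs inside the training job).
105 | # It shows how this entry point is typically submitted with spot instances and checkpointing;
106 | # the role, bucket, versions, and hyperparameter values below are placeholders/assumptions.
107 | #
108 | # from sagemaker.huggingface import HuggingFace
109 | #
110 | # huggingface_estimator = HuggingFace(
111 | #     entry_point="train.py",
112 | #     source_dir="./scripts",
113 | #     instance_type="ml.p3.2xlarge",
114 | #     instance_count=1,
115 | #     role="<your-sagemaker-execution-role>",
116 | #     transformers_version="4.6",
117 | #     pytorch_version="1.7",
118 | #     py_version="py36",
119 | #     hyperparameters={"epochs": 3, "model_name": "distilbert-base-uncased", "output_dir": "/opt/ml/checkpoints"},
120 | #     use_spot_instances=True,                       # train on spot capacity
121 | #     max_wait=7200,                                 # total time incl. waiting for spot (>= max_run)
122 | #     max_run=3600,                                  # maximum training time
123 | #     checkpoint_s3_uri="s3://<bucket>/checkpoints", # checkpoints survive spot interruptions
124 | # )
125 | # huggingface_estimator.fit({"train": "s3://<bucket>/train", "test": "s3://<bucket>/test"})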
--------------------------------------------------------------------------------
/course/videos/debug_error.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from transformers import pipeline\n",
61 | "\n",
62 | "model_checkpoint = \"distillbert-base-cased-distilled-squad\"\n",
63 | "question_answerer = pipeline(\"question_answering\", model=model_checkpoint)\n",
64 | "\n",
65 | "context = \"\"\"\n",
66 | "🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration between them. It's straightforward to train your models with one before loading them for inference with the other.\n",
67 | "\"\"\"\n",
68 | "question = \"Which deep learning libraries back 🤗 Transformers?\"\n",
69 | "question_answerer(question=question, context=context)"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "from transformers import pipeline\n",
79 | "\n",
80 | "model_checkpoint = \"distillbert-base-cased-distilled-squad\"\n",
81 | "question_answerer = pipeline(\"question-answering\", model=model_checkpoint)\n",
82 | "\n",
83 | "context = \"\"\"\n",
84 | "🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration between them. It's straightforward to train your models with one before loading them for inference with the other.\n",
85 | "\"\"\"\n",
86 | "question = \"Which deep learning libraries back 🤗 Transformers?\"\n",
87 | "question_answerer(question=question, context=context)"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "from transformers import pipeline\n",
97 | "\n",
98 | "model_checkpoint = \"distilbert-base-cased-distilled-squad\"\n",
99 | "question_answerer = pipeline(\"question-answering\", model=model_checkpoint)\n",
100 | "\n",
101 | "context = \"\"\"\n",
102 | "🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration between them. It's straightforward to train your models with one before loading them for inference with the other.\n",
103 | "\"\"\"\n",
104 | "question = \"Which deep learning libraries back 🤗 Transformers?\"\n",
105 | "question_answerer(question=question, context=context)"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": []
114 | }
115 | ],
116 | "metadata": {
117 | "colab": {
118 | "name": "What to do when you get an error?",
119 | "provenance": []
120 | }
121 | },
122 | "nbformat": 4,
123 | "nbformat_minor": 4
124 | }
125 |
--------------------------------------------------------------------------------
/course/videos/summarization_processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_dataset, load_metric\n",
61 | "\n",
62 | "raw_datasets = load_dataset(\"xsum\")\n",
63 | "raw_datasets = raw_datasets.remove_columns([\"id\"])\n",
64 | "raw_datasets[\"train\"]"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "print(raw_datasets[\"train\"][1])"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "from transformers import AutoTokenizer\n",
83 | "\n",
84 | "model_checkpoint = \"t5-small\"\n",
85 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
86 | "\n",
87 | "sample = raw_datasets[\"train\"][1]\n",
88 | "inputs = tokenizer(sample[\"document\"])\n",
89 | "with tokenizer.as_target_tokenizer():\n",
90 | " targets = tokenizer(sample[\"summary\"])\n",
91 | "\n",
92 | "print(tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"]))\n",
93 |     "print(tokenizer.convert_ids_to_tokens(targets[\"input_ids\"]))"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "max_input_length = 1024\n",
103 | "max_target_length = 128\n",
104 | "\n",
105 | "def preprocess_function(examples):\n",
106 | " model_inputs = tokenizer(examples[\"document\"], max_length=max_input_length, truncation=True)\n",
107 | "\n",
108 | " # Setup the tokenizer for targets\n",
109 | " with tokenizer.as_target_tokenizer():\n",
110 | " labels = tokenizer(examples[\"summary\"], max_length=max_target_length, truncation=True)\n",
111 | "\n",
112 | " model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
113 | " return model_inputs\n",
114 | "\n",
115 | "tokenized_datasets = raw_datasets.map(\n",
116 | " preprocess_function, batched=True, remove_columns=[\"document\", \"summary\"]\n",
117 | ")"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 |     "from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq\n",
127 |     "model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)\n",
128 |     "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": []
137 | }
138 | ],
139 | "metadata": {
140 | "colab": {
141 | "name": "Data processing for Summarization",
142 | "provenance": []
143 | }
144 | },
145 | "nbformat": 4,
146 | "nbformat_minor": 4
147 | }
148 |
--------------------------------------------------------------------------------
/course/chapter2/section4_pt.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Tokenizers (PyTorch)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "data": {
33 | "text/plain": [
34 | "['Jim', 'Henson', 'was', 'a', 'puppeteer']"
35 | ]
36 | },
37 | "execution_count": null,
38 | "metadata": {},
39 | "output_type": "execute_result"
40 | }
41 | ],
42 | "source": [
43 | "tokenized_text = \"Jim Henson was a puppeteer\".split()\n",
44 | "print(tokenized_text)"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "from transformers import BertTokenizer\n",
54 | "\n",
55 | "tokenizer = BertTokenizer.from_pretrained(\"bert-base-cased\")"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "from transformers import AutoTokenizer\n",
65 | "\n",
66 | "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [
74 | {
75 | "data": {
76 | "text/plain": [
77 | "{'input_ids': [101, 7993, 170, 11303, 1200, 2443, 1110, 3014, 102],\n",
78 | " 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
79 | " 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}"
80 | ]
81 | },
82 | "execution_count": null,
83 | "metadata": {},
84 | "output_type": "execute_result"
85 | }
86 | ],
87 | "source": [
88 | "tokenizer(\"Using a Transformer network is simple\")"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "tokenizer.save_pretrained(\"directory_on_my_computer\")"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "data": {
107 | "text/plain": [
108 | "['Using', 'a', 'transform', '##er', 'network', 'is', 'simple']"
109 | ]
110 | },
111 | "execution_count": null,
112 | "metadata": {},
113 | "output_type": "execute_result"
114 | }
115 | ],
116 | "source": [
117 | "from transformers import AutoTokenizer\n",
118 | "\n",
119 | "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n",
120 | "\n",
121 | "sequence = \"Using a Transformer network is simple\"\n",
122 | "tokens = tokenizer.tokenize(sequence)\n",
123 | "\n",
124 | "print(tokens)"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [
132 | {
133 | "data": {
134 | "text/plain": [
135 | "[7993, 170, 11303, 1200, 2443, 1110, 3014]"
136 | ]
137 | },
138 | "execution_count": null,
139 | "metadata": {},
140 | "output_type": "execute_result"
141 | }
142 | ],
143 | "source": [
144 | "ids = tokenizer.convert_tokens_to_ids(tokens)\n",
145 | "\n",
146 | "print(ids)"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {},
153 | "outputs": [
154 | {
155 | "data": {
156 | "text/plain": [
157 | "'Using a Transformer network is simple'"
158 | ]
159 | },
160 | "execution_count": null,
161 | "metadata": {},
162 | "output_type": "execute_result"
163 | }
164 | ],
165 | "source": [
166 | "decoded_string = tokenizer.decode([7993, 170, 11303, 1200, 2443, 1110, 3014])\n",
167 | "print(decoded_string)"
168 | ]
169 | }
170 | ],
171 | "metadata": {
172 | "colab": {
173 | "name": "Tokenizers (PyTorch)",
174 | "provenance": []
175 | }
176 | },
177 | "nbformat": 4,
178 | "nbformat_minor": 4
179 | }
180 |
--------------------------------------------------------------------------------
/course/chapter2/section4_tf.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Tokenizers (TensorFlow)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "data": {
33 | "text/plain": [
34 | "['Jim', 'Henson', 'was', 'a', 'puppeteer']"
35 | ]
36 | },
37 | "execution_count": null,
38 | "metadata": {},
39 | "output_type": "execute_result"
40 | }
41 | ],
42 | "source": [
43 | "tokenized_text = \"Jim Henson was a puppeteer\".split()\n",
44 | "print(tokenized_text)"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "from transformers import BertTokenizer\n",
54 | "\n",
55 | "tokenizer = BertTokenizer.from_pretrained(\"bert-base-cased\")"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "from transformers import AutoTokenizer\n",
65 | "\n",
66 | "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [
74 | {
75 | "data": {
76 | "text/plain": [
77 | "{'input_ids': [101, 7993, 170, 11303, 1200, 2443, 1110, 3014, 102],\n",
78 | " 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
79 | " 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}"
80 | ]
81 | },
82 | "execution_count": null,
83 | "metadata": {},
84 | "output_type": "execute_result"
85 | }
86 | ],
87 | "source": [
88 | "tokenizer(\"Using a Transformer network is simple\")"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "tokenizer.save_pretrained(\"directory_on_my_computer\")"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "data": {
107 | "text/plain": [
108 | "['Using', 'a', 'transform', '##er', 'network', 'is', 'simple']"
109 | ]
110 | },
111 | "execution_count": null,
112 | "metadata": {},
113 | "output_type": "execute_result"
114 | }
115 | ],
116 | "source": [
117 | "from transformers import AutoTokenizer\n",
118 | "\n",
119 | "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n",
120 | "\n",
121 | "sequence = \"Using a Transformer network is simple\"\n",
122 | "tokens = tokenizer.tokenize(sequence)\n",
123 | "\n",
124 | "print(tokens)"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [
132 | {
133 | "data": {
134 | "text/plain": [
135 | "[7993, 170, 11303, 1200, 2443, 1110, 3014]"
136 | ]
137 | },
138 | "execution_count": null,
139 | "metadata": {},
140 | "output_type": "execute_result"
141 | }
142 | ],
143 | "source": [
144 | "ids = tokenizer.convert_tokens_to_ids(tokens)\n",
145 | "\n",
146 | "print(ids)"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {},
153 | "outputs": [
154 | {
155 | "data": {
156 | "text/plain": [
157 | "'Using a Transformer network is simple'"
158 | ]
159 | },
160 | "execution_count": null,
161 | "metadata": {},
162 | "output_type": "execute_result"
163 | }
164 | ],
165 | "source": [
166 | "decoded_string = tokenizer.decode([7993, 170, 11303, 1200, 2443, 1110, 3014])\n",
167 | "print(decoded_string)"
168 | ]
169 | }
170 | ],
171 | "metadata": {
172 | "colab": {
173 | "name": "Tokenizers (TensorFlow)",
174 | "provenance": []
175 | }
176 | },
177 | "nbformat": 4,
178 | "nbformat_minor": 4
179 | }
180 |
--------------------------------------------------------------------------------
/course/videos/clm_processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
61 | "from datasets import load_dataset, DatasetDict\n",
62 | "\n",
63 | "ds_train = load_dataset(\"huggingface-course/codeparrot-ds-train\", split=\"train\")\n",
64 | "ds_valid = load_dataset(\"huggingface-course/codeparrot-ds-valid\", split=\"train\")\n",
65 | "\n",
66 | "raw_datasets = DatasetDict(\n",
67 | " {\n",
68 | " \"train\": ds_train,\n",
69 | " \"valid\": ds_valid,\n",
70 | " }\n",
71 | ")\n",
72 | "\n",
73 | "tokenizer = AutoTokenizer.from_pretrained(\"huggingface-course/code-search-net-tokenizer\")\n",
74 | "model = AutoModelForCausalLM.from_pretrained(\"huggingface-course/codeparrot-ds\")\n",
75 | "batch = tokenizer([\"import numpy as np\"], return_tensors=\"pt\")\n",
76 | "\n",
77 | "text = \"import numpy as np\\n\"*20\n",
78 | "context_length = 128"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "outputs = tokenizer(\n",
88 | " text,\n",
89 | " truncation=True,\n",
90 | " max_length=16,\n",
91 | " return_overflowing_tokens=True,\n",
92 | " return_length=True,\n",
93 | " )\n",
94 | "\n",
95 | "print(f\"Input chunk lengths: {(outputs['length'])}\")"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "metadata": {},
102 | "outputs": [],
103 | "source": [
104 | "def tokenize(element):\n",
105 | " outputs = tokenizer(\n",
106 | " element[\"content\"],\n",
107 | " truncation=True,\n",
108 | " max_length=context_length,\n",
109 | " return_overflowing_tokens=True,\n",
110 | " return_length=True,\n",
111 | " )\n",
112 | " input_batch = []\n",
113 | " for length, input_ids in zip(outputs[\"length\"], outputs[\"input_ids\"]):\n",
114 | " if length == context_length:\n",
115 | " input_batch.append(input_ids)\n",
116 | " return {\"input_ids\": input_batch}\n",
117 | "\n",
118 | "\n",
119 | "tokenized_datasets = raw_datasets.map(\n",
120 | " tokenize, batched=True, remove_columns=raw_datasets[\"train\"].column_names\n",
121 | ")"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "output = model(input_ids=batch[\"input_ids\"], labels=batch[\"input_ids\"])\n",
131 | "loss = output.loss"
132 | ]
133 | }
134 | ],
135 | "metadata": {
136 | "colab": {
137 | "name": "Data processing for Causal Language Modeling",
138 | "provenance": []
139 | }
140 | },
141 | "nbformat": 4,
142 | "nbformat_minor": 4
143 | }
144 |
--------------------------------------------------------------------------------
/course/videos/load_custom_dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "!wget https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "from datasets import load_dataset\n",
70 | "\n",
71 | "local_csv_dataset = load_dataset(\"csv\", data_files=\"winequality-white.csv\", sep=\";\")\n",
72 | "local_csv_dataset[\"train\"]"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "# Load the dataset from the URL directly\n",
82 | "dataset_url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\"\n",
83 | "remote_csv_dataset = load_dataset(\"csv\", data_files=dataset_url, sep=\";\")\n",
84 | "remote_csv_dataset"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "dataset_url = \"https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt\"\n",
94 | "text_dataset = load_dataset(\"text\", data_files=dataset_url)\n",
95 | "text_dataset[\"train\"][:5]"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "metadata": {},
102 | "outputs": [],
103 | "source": [
104 | "dataset_url = \"https://raw.githubusercontent.com/hirupert/sede/main/data/sede/train.jsonl\"\n",
105 | "json_lines_dataset = load_dataset(\"json\", data_files=dataset_url)\n",
106 | "json_lines_dataset[\"train\"][:2]"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "dataset_url = \"https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json\"\n",
116 | "json_dataset = load_dataset(\"json\", data_files=dataset_url, field=\"data\")\n",
117 | "json_dataset"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "url = \"https://rajpurkar.github.io/SQuAD-explorer/dataset/\"\n",
127 | "data_files = {\"train\": f\"{url}train-v2.0.json\", \"validation\": f\"{url}dev-v2.0.json\"}\n",
128 | "json_dataset = load_dataset(\"json\", data_files=data_files, field=\"data\")\n",
129 | "json_dataset"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": []
138 | }
139 | ],
140 | "metadata": {
141 | "colab": {
142 | "name": "Loading a custom dataset",
143 | "provenance": []
144 | }
145 | },
146 | "nbformat": 4,
147 | "nbformat_minor": 4
148 | }
149 |
--------------------------------------------------------------------------------
/course/videos/debug_training_tf.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_dataset, load_metric\n",
61 | "from transformers import (\n",
62 | " AutoTokenizer,\n",
63 | " TFAutoModelForSequenceClassification,\n",
64 | ")\n",
65 | "\n",
66 | "raw_datasets = load_dataset(\"glue\", \"mnli\")\n",
67 | "\n",
68 | "model_checkpoint = \"distilbert-base-uncased\"\n",
69 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
70 | "\n",
71 | "\n",
72 | "def preprocess_function(examples):\n",
73 | " return tokenizer(examples[\"premise\"], examples[\"hypothesis\"], truncation=True)\n",
74 | "\n",
75 | "\n",
76 | "tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)\n",
77 | "model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint)"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": null,
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "train_dataset = tokenized_datasets[\"train\"].to_tf_dataset(\n",
87 | " columns=[\"input_ids\", \"labels\"], batch_size=16, shuffle=True\n",
88 | ")\n",
89 | "\n",
90 | "validation_dataset = tokenized_datasets[\"validation_matched\"].to_tf_dataset(\n",
91 | " columns=[\"input_ids\", \"labels\"], batch_size=16, shuffle=True\n",
92 | ")\n",
93 | "\n",
94 | "model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint)\n",
95 | "\n",
96 | "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer='adam')\n",
97 | "\n",
98 | "model.fit(train_dataset)"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "for batch in train_dataset:\n",
108 | " break"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "model.compile(optimizer='adam')"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "model = TFAutoModelForSequenceClassification.from_pretrained(\n",
127 | " model_checkpoint,\n",
128 | " num_labels=3\n",
129 | ")"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "model.compile(optimizer='adam')"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": null,
144 | "metadata": {},
145 | "outputs": [],
146 | "source": []
147 | }
148 | ],
149 | "metadata": {
150 | "colab": {
151 | "name": "Debugging the Training Pipeline (TensorFlow)",
152 | "provenance": []
153 | }
154 | },
155 | "nbformat": 4,
156 | "nbformat_minor": 4
157 | }
158 |
--------------------------------------------------------------------------------
/course/chapter8/section3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Asking for help on the forums"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "from transformers import AutoTokenizer, AutoModel\n",
33 | "\n",
34 | "model_checkpoint = \"distilbert-base-uncased\"\n",
35 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
36 | "model = AutoModel.from_pretrained(model_checkpoint)"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "text = \"\"\"\n",
46 | "Generation One is a retroactive term for the Transformers characters that\n",
47 | "appeared between 1984 and 1993. The Transformers began with the 1980s Japanese\n",
48 | "toy lines Micro Change and Diaclone. They presented robots able to transform\n",
49 | "into everyday vehicles, electronic items or weapons. Hasbro bought the Micro\n",
50 | "Change and Diaclone toys, and partnered with Takara. Marvel Comics was hired by\n",
51 | "Hasbro to create the backstory; editor-in-chief Jim Shooter wrote an overall\n",
52 |     "story, and gave the task of creating the characters to writer Dennis O'Neil.\n",
53 | "Unhappy with O'Neil's work (although O'Neil created the name \"Optimus Prime\"),\n",
54 | "Shooter chose Bob Budiansky to create the characters.\n",
55 | "\n",
56 | "The Transformers mecha were largely designed by Shōji Kawamori, the creator of\n",
57 | "the Japanese mecha anime franchise Macross (which was adapted into the Robotech\n",
58 | "franchise in North America). Kawamori came up with the idea of transforming\n",
59 | "mechs while working on the Diaclone and Macross franchises in the early 1980s\n",
60 | "(such as the VF-1 Valkyrie in Macross and Robotech), with his Diaclone mechs\n",
61 | "later providing the basis for Transformers.\n",
62 | "\n",
63 | "The primary concept of Generation One is that the heroic Optimus Prime, the\n",
64 | "villainous Megatron, and their finest soldiers crash land on pre-historic Earth\n",
65 | "in the Ark and the Nemesis before awakening in 1985, Cybertron hurtling through\n",
66 | "the Neutral zone as an effect of the war. The Marvel comic was originally part\n",
67 | "of the main Marvel Universe, with appearances from Spider-Man and Nick Fury,\n",
68 | "plus some cameos, as well as a visit to the Savage Land.\n",
69 | "\n",
70 | "The Transformers TV series began around the same time. Produced by Sunbow\n",
71 | "Productions and Marvel Productions, later Hasbro Productions, from the start it\n",
72 | "contradicted Budiansky's backstories. The TV series shows the Autobots looking\n",
73 | "for new energy sources, and crash landing as the Decepticons attack. Marvel\n",
74 | "interpreted the Autobots as destroying a rogue asteroid approaching Cybertron.\n",
75 | "Shockwave is loyal to Megatron in the TV series, keeping Cybertron in a\n",
76 | "stalemate during his absence, but in the comic book he attempts to take command\n",
77 | "of the Decepticons. The TV series would also differ wildly from the origins\n",
78 | "Budiansky had created for the Dinobots, the Decepticon turned Autobot Jetfire\n",
79 | "(known as Skyfire on TV), the Constructicons (who combine to form\n",
80 | "Devastator),[19][20] and Omega Supreme. The Marvel comic establishes early on\n",
81 | "that Prime wields the Creation Matrix, which gives life to machines. In the\n",
82 | "second season, the two-part episode The Key to Vector Sigma introduced the\n",
83 | "ancient Vector Sigma computer, which served the same original purpose as the\n",
84 | "Creation Matrix (giving life to Transformers), and its guardian Alpha Trion.\n",
85 | "\"\"\"\n",
86 | "\n",
87 | "inputs = tokenizer(text, return_tensors=\"pt\")\n",
88 | "logits = model(**inputs).logits"
89 | ]
90 | }
91 | ],
92 | "metadata": {
93 | "colab": {
94 | "name": "Asking for help on the forums",
95 | "provenance": []
96 | }
97 | },
98 | "nbformat": 4,
99 | "nbformat_minor": 4
100 | }
101 |
--------------------------------------------------------------------------------
/sagemaker/02_getting_started_tensorflow/scripts/train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import os
4 | import sys
5 |
6 | import tensorflow as tf
7 | from datasets import load_dataset
8 | from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding, create_optimizer
9 |
10 |
11 | if __name__ == "__main__":
12 |
13 | parser = argparse.ArgumentParser()
14 |
15 | # Hyperparameters sent by the client are passed as command-line arguments to the script.
16 | parser.add_argument("--epochs", type=int, default=3)
17 | parser.add_argument("--train_batch_size", type=int, default=16)
18 | parser.add_argument("--eval_batch_size", type=int, default=8)
19 | parser.add_argument("--model_id", type=str)
20 | parser.add_argument("--learning_rate", type=str, default=3e-5)
21 |
22 | # Data, model, and output directories
23 | parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
24 | parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
25 | parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
26 |
27 | args, _ = parser.parse_known_args()
28 |
29 | # Set up logging
30 | logger = logging.getLogger(__name__)
31 |
32 | logging.basicConfig(
33 | level=logging.getLevelName("INFO"),
34 | handlers=[logging.StreamHandler(sys.stdout)],
35 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
36 | )
37 |
38 | # Load tokenizer
39 | tokenizer = AutoTokenizer.from_pretrained(args.model_id)
40 |
41 | # Load DatasetDict
42 | dataset = load_dataset("imdb")
43 |
44 | # Preprocess train dataset
45 | def preprocess_function(examples):
46 | return tokenizer(examples["text"], truncation=True)
47 |
48 | encoded_dataset = dataset.map(preprocess_function, batched=True)
49 |
50 | # define tokenizer_columns
51 | # tokenizer_columns is the list of keys from the dataset that get passed to the TensorFlow model
52 | tokenizer_columns = ["attention_mask", "input_ids"]
53 |
54 | # convert to TF datasets
55 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
56 | encoded_dataset["train"] = encoded_dataset["train"].rename_column("label", "labels")
57 | tf_train_dataset = encoded_dataset["train"].to_tf_dataset(
58 | columns=tokenizer_columns,
59 | label_cols=["labels"],
60 | shuffle=True,
61 |         batch_size=args.train_batch_size,
62 | collate_fn=data_collator,
63 | )
64 | encoded_dataset["test"] = encoded_dataset["test"].rename_column("label", "labels")
65 | tf_validation_dataset = encoded_dataset["test"].to_tf_dataset(
66 | columns=tokenizer_columns,
67 | label_cols=["labels"],
68 | shuffle=False,
69 |         batch_size=args.eval_batch_size,
70 | collate_fn=data_collator,
71 | )
72 |
73 | # Prepare model labels - useful in inference API
74 | labels = encoded_dataset["train"].features["labels"].names
75 | num_labels = len(labels)
76 | label2id, id2label = dict(), dict()
77 | for i, label in enumerate(labels):
78 | label2id[label] = str(i)
79 | id2label[str(i)] = label
80 |
81 | # download model from model hub
82 | model = TFAutoModelForSequenceClassification.from_pretrained(
83 | args.model_id, num_labels=num_labels, label2id=label2id, id2label=id2label
84 | )
85 |
86 | # create Adam optimizer with learning rate scheduling
87 | batches_per_epoch = len(encoded_dataset["train"]) // args.train_batch_size
88 | total_train_steps = int(batches_per_epoch * args.epochs)
89 |
90 |     optimizer, _ = create_optimizer(init_lr=float(args.learning_rate), num_warmup_steps=0, num_train_steps=total_train_steps)
91 | loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
92 |
93 | # define metric and compile model
94 | metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]
95 | model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
96 |
97 | # Training
98 | logger.info("*** Train ***")
99 | train_results = model.fit(
100 | tf_train_dataset,
101 | epochs=args.epochs,
102 | validation_data=tf_validation_dataset,
103 | )
104 |
105 | output_eval_file = os.path.join(args.output_data_dir, "train_results.txt")
106 |
107 | with open(output_eval_file, "w") as writer:
108 | logger.info("***** Train results *****")
109 | logger.info(train_results)
110 | for key, value in train_results.history.items():
111 | logger.info(" %s = %s", key, value)
112 | writer.write("%s = %s\n" % (key, value))
113 |
114 | # Save result
115 | model.save_pretrained(args.model_dir)
116 | tokenizer.save_pretrained(args.model_dir)
117 |
--------------------------------------------------------------------------------
/course/videos/save_load_dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_dataset\n",
61 | "\n",
62 | "raw_datasets = load_dataset(\"allocine\")\n",
63 | "raw_datasets.cache_files"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "raw_datasets.save_to_disk(\"my-arrow-datasets\")"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "from datasets import load_from_disk\n",
82 | "\n",
83 | "arrow_datasets_reloaded = load_from_disk(\"my-arrow-datasets\")\n",
84 | "arrow_datasets_reloaded"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "for split, dataset in raw_datasets.items():\n",
94 | " dataset.to_csv(f\"my-dataset-{split}.csv\", index=None)"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "data_files = {\n",
104 | " \"train\": \"my-dataset-train.csv\",\n",
105 | " \"validation\": \"my-dataset-validation.csv\",\n",
106 | " \"test\": \"my-dataset-test.csv\",\n",
107 | "}\n",
108 | "\n",
109 | "csv_datasets_reloaded = load_dataset(\"csv\", data_files=data_files)\n",
110 | "csv_datasets_reloaded"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "# Save in JSON Lines format\n",
120 | "for split, dataset in raw_datasets.items():\n",
121 | " dataset.to_json(f\"my-dataset-{split}.jsonl\")\n",
122 | "\n",
123 | "# Save in Parquet format\n",
124 | "for split, dataset in raw_datasets.items():\n",
125 | " dataset.to_parquet(f\"my-dataset-{split}.parquet\")"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": null,
131 | "metadata": {},
132 | "outputs": [],
133 | "source": [
134 | "json_data_files = {\n",
135 | " \"train\": \"my-dataset-train.jsonl\",\n",
136 | " \"validation\": \"my-dataset-validation.jsonl\",\n",
137 | " \"test\": \"my-dataset-test.jsonl\",\n",
138 | "}\n",
139 | "\n",
140 | "parquet_data_files = {\n",
141 | " \"train\": \"my-dataset-train.parquet\",\n",
142 | " \"validation\": \"my-dataset-validation.parquet\",\n",
143 | " \"test\": \"my-dataset-test.parquet\",\n",
144 | "}\n",
145 | "\n",
146 | "# Reload with the `json` script\n",
147 | "json_datasets_reloaded = load_dataset(\"json\", data_files=json_data_files)\n",
148 | "# Reload with the `parquet` script\n",
149 | "parquet_datasets_reloaded = load_dataset(\"parquet\", data_files=parquet_data_files)"
150 | ]
151 | }
152 | ],
153 | "metadata": {
154 | "colab": {
155 | "name": "Saving and reloading a dataset",
156 | "provenance": []
157 | }
158 | },
159 | "nbformat": 4,
160 | "nbformat_minor": 4
161 | }
162 |
--------------------------------------------------------------------------------
/sagemaker/15_training_compiler/scripts/train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import os
4 | import random
5 | import sys
6 |
7 | import numpy as np
8 | import torch
9 | from datasets import load_from_disk, load_metric
10 | from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
11 | from transformers.trainer_utils import get_last_checkpoint
12 |
13 | if __name__ == "__main__":
14 |
15 | parser = argparse.ArgumentParser()
16 |
17 | # hyperparameters sent by the client are passed as command-line arguments to the script.
18 | parser.add_argument("--epochs", type=int, default=3)
19 | parser.add_argument("--train_batch_size", type=int, default=32)
20 | parser.add_argument("--eval_batch_size", type=int, default=64)
21 | parser.add_argument("--warmup_steps", type=int, default=500)
22 | parser.add_argument("--model_id", type=str)
23 | parser.add_argument("--learning_rate", type=str, default=5e-5)
24 | parser.add_argument("--fp16", type=bool, default=True)
25 |
26 | # Data, model, and output directories
27 | parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
28 | parser.add_argument("--output_dir", type=str, default=os.environ["SM_MODEL_DIR"])
29 | parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
30 | parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
31 | parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
32 |
33 | args, _ = parser.parse_known_args()
34 |
35 | # is needed for Amazon SageMaker Training Compiler
36 | os.environ["GPU_NUM_DEVICES"] = args.n_gpus
37 |
38 | # Set up logging
39 | logger = logging.getLogger(__name__)
40 |
41 | logging.basicConfig(
42 | level=logging.getLevelName("INFO"),
43 | handlers=[logging.StreamHandler(sys.stdout)],
44 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
45 | )
46 |
47 | # load datasets
48 | train_dataset = load_from_disk(args.training_dir)
49 | test_dataset = load_from_disk(args.test_dir)
50 |
51 | logger.info(f" loaded train_dataset length is: {len(train_dataset)}")
52 | logger.info(f" loaded test_dataset length is: {len(test_dataset)}")
53 |
54 | # define metrics and metrics function
55 | metric = load_metric("accuracy")
56 |
57 | def compute_metrics(eval_pred):
58 | predictions, labels = eval_pred
59 | predictions = np.argmax(predictions, axis=1)
60 | return metric.compute(predictions=predictions, references=labels)
61 |
62 | # Prepare model labels - useful in inference API
63 | labels = train_dataset.features["labels"].names
64 | num_labels = len(labels)
65 | label2id, id2label = dict(), dict()
66 | for i, label in enumerate(labels):
67 | label2id[label] = str(i)
68 | id2label[str(i)] = label
69 |
70 | # download model from model hub
71 | model = AutoModelForSequenceClassification.from_pretrained(
72 | args.model_id, num_labels=num_labels, label2id=label2id, id2label=id2label
73 | )
74 | tokenizer = AutoTokenizer.from_pretrained(args.model_id)
75 |
76 | # define training args
77 | training_args = TrainingArguments(
78 | output_dir=args.output_dir,
79 | overwrite_output_dir=True if get_last_checkpoint(args.output_dir) is not None else False,
80 | num_train_epochs=args.epochs,
81 | per_device_train_batch_size=args.train_batch_size,
82 | per_device_eval_batch_size=args.eval_batch_size,
83 | warmup_steps=args.warmup_steps,
84 | fp16=args.fp16,
85 | evaluation_strategy="epoch",
86 | save_strategy="epoch",
87 | save_total_limit=2,
88 | logging_dir=f"{args.output_data_dir}/logs",
89 | learning_rate=float(args.learning_rate),
90 | load_best_model_at_end=True,
91 | metric_for_best_model="accuracy",
92 | disable_tqdm=True,
93 | )
94 |
95 | # create Trainer instance
96 | trainer = Trainer(
97 | model=model,
98 | args=training_args,
99 | compute_metrics=compute_metrics,
100 | train_dataset=train_dataset,
101 | eval_dataset=test_dataset,
102 | tokenizer=tokenizer,
103 | )
104 |
105 | # train model
106 | trainer.train()
107 |
108 | # evaluate model
109 | eval_result = trainer.evaluate(eval_dataset=test_dataset)
110 |
111 |     # write eval results to a file that can be accessed later from the S3 output
112 |     with open(os.path.join(args.output_data_dir, "eval_results.txt"), "w") as writer:
113 |         print("***** Eval results *****")
114 | for key, value in sorted(eval_result.items()):
115 | writer.write(f"{key} = {value}\n")
116 | print(f"{key} = {value}\n")
117 |
118 | # Save the model to S3; use os.environ["SM_MODEL_DIR"] to make sure checkpointing works
119 | trainer.save_model(os.environ["SM_MODEL_DIR"])
120 |
--------------------------------------------------------------------------------
/course/videos/mlm_processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_dataset\n",
61 | "\n",
62 | "raw_datasets = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\")\n",
63 | "raw_datasets[\"train\"]"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "from datasets import load_dataset\n",
73 | "from transformers import AutoTokenizer\n",
74 | "\n",
75 | "raw_datasets = load_dataset(\"imdb\")\n",
76 | "raw_datasets = raw_datasets.remove_columns(\"label\")\n",
77 | "\n",
78 | "model_checkpoint = \"distilbert-base-cased\"\n",
79 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
80 | "context_length = 128\n",
81 | "\n",
82 | "def tokenize_pad_and_truncate(texts):\n",
83 | " return tokenizer(texts[\"text\"], truncation=True, padding=\"max_length\", max_length=context_length)\n",
84 | "\n",
85 | "tokenized_datasets = raw_datasets.map(tokenize_pad_and_truncate, batched=True)"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "def tokenize_and_chunk(texts):\n",
95 | " return tokenizer(\n",
96 | " texts[\"text\"], truncation=True, max_length=context_length,\n",
97 | " return_overflowing_tokens=True\n",
98 | " )\n",
99 | "\n",
100 | "tokenized_datasets = raw_datasets.map(\n",
101 | " tokenize_and_chunk, batched=True, remove_columns=[\"text\"]\n",
102 | ")\n",
103 | "\n",
104 | "len(raw_datasets[\"train\"]), len(tokenized_datasets[\"train\"])"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "def tokenize_and_chunk(texts):\n",
114 | " all_input_ids = []\n",
115 | " for input_ids in tokenizer(texts[\"text\"])[\"input_ids\"]:\n",
116 | " all_input_ids.extend(input_ids)\n",
117 | " all_input_ids.append(tokenizer.eos_token_id)\n",
118 | " \n",
119 | " chunks = []\n",
120 | " for idx in range(0, len(all_input_ids), context_length):\n",
121 | " chunks.append(all_input_ids[idx: idx + context_length])\n",
122 | " return {\"input_ids\": chunks}\n",
123 | "\n",
124 | "tokenized_datasets = raw_datasets.map(tokenize_and_chunk, batched=True, remove_columns=[\"text\"])\n",
125 | "\n",
126 | "len(raw_datasets[\"train\"]), len(tokenized_datasets[\"train\"])"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "from transformers import DataCollatorForLanguageModeling\n",
136 | "\n",
137 | "data_collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15)"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {},
144 | "outputs": [],
145 | "source": []
146 | }
147 | ],
148 | "metadata": {
149 | "colab": {
150 | "name": "Data processing for Masked Language Modeling",
151 | "provenance": []
152 | }
153 | },
154 | "nbformat": 4,
155 | "nbformat_minor": 4
156 | }
157 |
--------------------------------------------------------------------------------
/course/videos/building_tokenizer.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_dataset\n",
61 | "\n",
62 | "dataset = load_dataset(\"wikitext\", name=\"wikitext-2-raw-v1\", split=\"train\")\n",
63 | "\n",
64 | "\n",
65 | "def get_training_corpus():\n",
66 | " for i in range(0, len(dataset), 1000):\n",
67 | " yield dataset[i : i + 1000][\"text\"]"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers, processors, decoders"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "tokenizer = Tokenizer(models.WordPiece(unk_token=\"[UNK]\"))"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "tokenizer.normalizer = normalizers.Sequence(\n",
95 | " [\n",
96 | " normalizers.Replace(Regex(r\"[\\p{Other}&&[^\\n\\t\\r]]\"), \"\"),\n",
97 | " normalizers.Replace(Regex(r\"[\\s]\"), \" \"),\n",
98 | " normalizers.Lowercase(),\n",
99 | " normalizers.NFD(), normalizers.StripAccents()]\n",
100 | ")"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "special_tokens = [\"[UNK]\", \"[PAD]\", \"[CLS]\", \"[SEP]\", \"[MASK]\"]\n",
119 | "trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {},
126 | "outputs": [],
127 | "source": [
128 | "tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "cls_token_id = tokenizer.token_to_id(\"[CLS]\")\n",
138 | "sep_token_id = tokenizer.token_to_id(\"[SEP]\")\n",
139 | "tokenizer.post_processor = processors.TemplateProcessing(\n",
140 | " single=f\"[CLS]:0 $A:0 [SEP]:0\",\n",
141 | " pair=f\"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1\",\n",
142 | " special_tokens=[(\"[CLS]\", cls_token_id), (\"[SEP]\", sep_token_id)],\n",
143 | ")"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "metadata": {},
150 | "outputs": [],
151 | "source": [
152 | "tokenizer.decoder = decoders.WordPiece(prefix=\"##\")"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {},
159 | "outputs": [],
160 | "source": []
161 | }
162 | ],
163 | "metadata": {
164 | "colab": {
165 | "name": "Building a new tokenizer",
166 | "provenance": []
167 | }
168 | },
169 | "nbformat": 4,
170 | "nbformat_minor": 4
171 | }
172 |
--------------------------------------------------------------------------------
/course/videos/memory_mapping_streaming.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_dataset\n",
61 | "\n",
62 | "data_files = \"https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst\"\n",
63 | "large_dataset = load_dataset(\"json\", data_files=data_files, split=\"train\")\n",
64 | "size_gb = large_dataset.dataset_size / (1024 ** 3)\n",
65 | "print(f\"Dataset size (cache file) : {size_gb:.2f} GB\")"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "import psutil\n",
75 | "\n",
76 | "# Process.memory_info is expressed in bytes, so convert to megabytes\n",
77 | "print(f\"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB\")"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": null,
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "import timeit\n",
87 | "\n",
88 | "code_snippet = \"\"\"batch_size = 1000\n",
89 | "\n",
90 | "for idx in range(0, len(large_dataset), batch_size):\n",
91 | " _ = large_dataset[idx:idx + batch_size]\n",
92 | "\"\"\"\n",
93 | "\n",
94 | "time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())\n",
95 | "print(\n",
96 | " f\"Iterated over {len(large_dataset)} examples (about {size_gb:.1f} GB) in \"\n",
97 | " f\"{time:.1f}s, i.e. {size_gb/time:.3f} GB/s\"\n",
98 | ")"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "large_dataset_streamed = load_dataset(\n",
108 | " \"json\", data_files=data_files, split=\"train\", streaming=True)\n",
109 | "\n",
110 | "next(iter(large_dataset_streamed))"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "type(large_dataset_streamed)"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {},
126 | "outputs": [],
127 | "source": [
128 | "from transformers import AutoTokenizer\n",
129 | "\n",
130 | "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n",
131 | "tokenized_dataset = large_dataset_streamed.map(lambda x: tokenizer(x[\"text\"]))\n",
132 | "next(iter(tokenized_dataset))"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "# Select the first 5 examples \n",
142 | "dataset_head = large_dataset_streamed.take(5)\n",
143 | "list(dataset_head)"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "metadata": {},
150 | "outputs": [],
151 | "source": [
152 | "# Skip the first 1,000 examples and include the rest in the training set\n",
153 | "train_dataset = large_dataset_streamed.skip(1000)\n",
154 | "# Take the first 1,000 examples for the validation set\n",
155 | "validation_dataset = large_dataset_streamed.take(1000)"
156 | ]
157 | }
158 | ],
159 | "metadata": {
160 | "colab": {
161 | "name": "Memory Mapping & streaming",
162 | "provenance": []
163 | }
164 | },
165 | "nbformat": 4,
166 | "nbformat_minor": 4
167 | }
168 |
--------------------------------------------------------------------------------
/course/videos/train_new_tokenizer.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from transformers import BertTokenizerFast\n",
61 | "\n",
62 | "tokenizer = BertTokenizerFast.from_pretrained(\n",
63 | " 'huggingface-course/bert-base-uncased-tokenizer-without-normalizer'\n",
64 | ")"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "text = \"here is a sentence adapted to our tokenizer\"\n",
74 | "print(tokenizer.tokenize(text))"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "text = \"এই বাক্যটি আমাদের টোকেনাইজারের উপযুক্ত নয়\"\n",
84 | "print(tokenizer.tokenize(text))"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "text = \"this tokenizer does not know àccënts and CAPITAL LETTERS\"\n",
94 | "print(tokenizer.tokenize(text))"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "text = \"the medical vocabulary is divided into many sub-token: paracetamol, phrayngitis\"\n",
104 | "print(tokenizer.tokenize(text))"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "from datasets import load_dataset\n",
114 | "\n",
115 | "raw_datasets = load_dataset(\"code_search_net\", \"python\")"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "def get_training_corpus():\n",
125 | " dataset = raw_datasets[\"train\"]\n",
126 | " for start_idx in range(0, len(dataset), 1000):\n",
127 | " samples = dataset[start_idx : start_idx + 1000]\n",
128 | " yield samples[\"whole_func_string\"]"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "from transformers import AutoTokenizer\n",
138 | "\n",
139 | "old_tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
140 | "training_corpus = get_training_corpus()\n",
141 | "new_tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 52000)\n",
142 | "new_tokenizer.save_pretrained(\"code-search-net-tokenizer\")"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "metadata": {},
149 | "outputs": [],
150 | "source": [
151 | "example = \"\"\"class LinearLayer():\n",
152 | " def __init__(self, input_size, output_size):\n",
153 | " self.weight = torch.randn(input_size, output_size)\n",
154 | " self.bias = torch.zeros(output_size)\n",
155 | "\n",
156 | " def __call__(self, x):\n",
157 | " return x @ self.weights + self.bias\n",
158 | " \"\"\"\n",
159 | "\n",
160 | "print(old_tokenizer.tokenize(example))\n",
161 | "print(new_tokenizer.tokenize(example))"
162 | ]
163 | }
164 | ],
165 | "metadata": {
166 | "colab": {
167 | "name": "Training a new tokenizer",
168 | "provenance": []
169 | }
170 | },
171 | "nbformat": 4,
172 | "nbformat_minor": 4
173 | }
174 |
--------------------------------------------------------------------------------
/course/chapter5/section2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# What if my dataset isn't on the Hub?"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz\n",
33 | "!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "!gzip -dkv SQuAD_it-*.json.gz"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "from datasets import load_dataset\n",
52 | "\n",
53 | "squad_it_dataset = load_dataset(\"json\", data_files=\"SQuAD_it-train.json\", field=\"data\")"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "metadata": {},
60 | "outputs": [
61 | {
62 | "data": {
63 | "text/plain": [
64 | "DatasetDict({\n",
65 | " train: Dataset({\n",
66 | " features: ['title', 'paragraphs'],\n",
67 | " num_rows: 442\n",
68 | " })\n",
69 | "})"
70 | ]
71 | },
72 | "execution_count": null,
73 | "metadata": {},
74 | "output_type": "execute_result"
75 | }
76 | ],
77 | "source": [
78 | "squad_it_dataset"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [
86 | {
87 | "data": {
88 | "text/plain": [
89 | "{\n",
90 | " \"title\": \"Terremoto del Sichuan del 2008\",\n",
91 | " \"paragraphs\": [\n",
92 | " {\n",
93 | " \"context\": \"Il terremoto del Sichuan del 2008 o il terremoto...\",\n",
94 | " \"qas\": [\n",
95 | " {\n",
96 | " \"answers\": [{\"answer_start\": 29, \"text\": \"2008\"}],\n",
97 | " \"id\": \"56cdca7862d2951400fa6826\",\n",
98 | " \"question\": \"In quale anno si è verificato il terremoto nel Sichuan?\",\n",
99 | " },\n",
100 | " ...\n",
101 | " ],\n",
102 | " },\n",
103 | " ...\n",
104 | " ],\n",
105 | "}"
106 | ]
107 | },
108 | "execution_count": null,
109 | "metadata": {},
110 | "output_type": "execute_result"
111 | }
112 | ],
113 | "source": [
114 | "squad_it_dataset[\"train\"][0]"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {},
121 | "outputs": [
122 | {
123 | "data": {
124 | "text/plain": [
125 | "DatasetDict({\n",
126 | " train: Dataset({\n",
127 | " features: ['title', 'paragraphs'],\n",
128 | " num_rows: 442\n",
129 | " })\n",
130 | " test: Dataset({\n",
131 | " features: ['title', 'paragraphs'],\n",
132 | " num_rows: 48\n",
133 | " })\n",
134 | "})"
135 | ]
136 | },
137 | "execution_count": null,
138 | "metadata": {},
139 | "output_type": "execute_result"
140 | }
141 | ],
142 | "source": [
143 | "data_files = {\"train\": \"SQuAD_it-train.json\", \"test\": \"SQuAD_it-test.json\"}\n",
144 | "squad_it_dataset = load_dataset(\"json\", data_files=data_files, field=\"data\")\n",
145 | "squad_it_dataset"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "data_files = {\"train\": \"SQuAD_it-train.json.gz\", \"test\": \"SQuAD_it-test.json.gz\"}\n",
155 | "squad_it_dataset = load_dataset(\"json\", data_files=data_files, field=\"data\")"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "url = \"https://github.com/crux82/squad-it/raw/master/\"\n",
165 | "data_files = {\n",
166 | " \"train\": url + \"SQuAD_it-train.json.gz\",\n",
167 | " \"test\": url + \"SQuAD_it-test.json.gz\",\n",
168 | "}\n",
169 | "squad_it_dataset = load_dataset(\"json\", data_files=data_files, field=\"data\")"
170 | ]
171 | }
172 | ],
173 | "metadata": {
174 | "colab": {
175 | "name": "What if my dataset isn't on the Hub?",
176 | "provenance": []
177 | }
178 | },
179 | "nbformat": 4,
180 | "nbformat_minor": 4
181 | }
182 |
--------------------------------------------------------------------------------
/course/videos/slice_and_dice.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_dataset\n",
61 | "\n",
62 | "squad = load_dataset(\"squad\", split=\"train\")\n",
63 | "squad[0]"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "squad_shuffled = squad.shuffle(seed=666)\n",
73 | "squad_shuffled[0]"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "dataset = squad.train_test_split(test_size=0.1)\n",
83 | "dataset"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "indices = [0, 10, 20, 40, 80]\n",
93 | "examples = squad.select(indices)\n",
94 | "examples"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "sample = squad.shuffle().select(range(5))\n",
104 | "sample"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "squad_filtered = squad.filter(lambda x : x[\"title\"].startswith(\"L\"))\n",
114 | "squad_filtered[0]"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {},
121 | "outputs": [],
122 | "source": [
123 | "squad.rename_column(\"context\", \"passages\")"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "metadata": {},
130 | "outputs": [],
131 | "source": [
132 | "squad.remove_columns([\"id\", \"title\"])"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "squad"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "squad.flatten()"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": null,
156 | "metadata": {},
157 | "outputs": [],
158 | "source": [
159 | "def lowercase_title(example):\n",
160 | " return {\"title\": example[\"title\"].lower()}\n",
161 | "\n",
162 | "squad_lowercase = squad.map(lowercase_title)\n",
163 | "# Peek at random sample\n",
164 | "squad_lowercase.shuffle(seed=42)[\"title\"][:5]"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": null,
170 | "metadata": {},
171 | "outputs": [],
172 | "source": [
173 | "from transformers import AutoTokenizer\n",
174 | "\n",
175 | "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n",
176 | "\n",
177 | "def tokenize_title(example):\n",
178 | " return tokenizer(example[\"title\"])\n",
179 | "\n",
180 | "squad.map(tokenize_title, batched=True, batch_size=500)"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "metadata": {},
187 | "outputs": [],
188 | "source": []
189 | }
190 | ],
191 | "metadata": {
192 | "colab": {
193 | "name": "Slide and dice a dataset 🔪",
194 | "provenance": []
195 | }
196 | },
197 | "nbformat": 4,
198 | "nbformat_minor": 4
199 | }
200 |
--------------------------------------------------------------------------------
/course/videos/custom_loss.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
61 | "from accelerate import Accelerator\n",
62 | "\n",
63 | "accelerator = Accelerator()\n",
64 | "tokenizer = AutoTokenizer.from_pretrained(\"huggingface-course/code-search-net-tokenizer\")\n",
65 | "model = AutoModelForCausalLM.from_pretrained(\"huggingface-course/codeparrot-ds\")\n",
66 | "\n",
67 | "keytoken_ids = []\n",
68 | "for keyword in [\n",
69 | " \"plt\",\n",
70 | " \"pd\",\n",
71 | " \"sk\",\n",
72 | " \"fit\",\n",
73 | " \"predict\",\n",
74 | " \" plt\",\n",
75 | " \" pd\",\n",
76 | " \" sk\",\n",
77 | " \" fit\",\n",
78 | " \" predict\",\n",
79 | "]:\n",
80 | " ids = tokenizer([keyword]).input_ids[0]\n",
81 | " keytoken_ids.append(ids[0])\n",
82 | "\n",
83 | "batch = tokenizer([\"import numpy as np\"], return_tensors=\"pt\")\n",
84 | "model = accelerator.prepare(model)"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "from torch.nn import CrossEntropyLoss\n",
94 | "import torch\n",
95 | "\n",
96 | "\n",
97 | "def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):\n",
98 | " # Shift so that tokens < n predict n\n",
99 | " shift_labels = inputs[..., 1:].contiguous()\n",
100 | " shift_logits = logits[..., :-1, :].contiguous()\n",
101 | " # Calculate per-token loss\n",
102 | " loss_fct = CrossEntropyLoss(reduce=False)\n",
103 | " loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))\n",
104 | " # Resize and average loss per sample\n",
105 | " loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(axis=1)\n",
106 | " # Calculate and scale weighting\n",
107 | " weights = torch.stack([(inputs == kt).float() for kt in keytoken_ids]).sum(\n",
108 | " axis=[0, 2]\n",
109 | " )\n",
110 | " weights = alpha * (1.0 + weights)\n",
111 | " # Calculate weighted average\n",
112 | " weighted_loss = (loss_per_sample * weights).mean()\n",
113 | " return weighted_loss"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "logits = model(batch[\"input_ids\"]).logits\n",
123 | "loss = keytoken_weighted_loss(batch[\"input_ids\"], logits, keytoken_ids)\n",
124 | "accelerator.backward(loss)"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "from transformers import Trainer\n",
134 | "\n",
135 | "class MyTrainer(Trainer):\n",
136 | " def compute_loss(self, model, inputs, return_outputs=False):\n",
137 | " input_ids = inputs.get(\"input_ids\")\n",
138 | " outputs = model(input_ids)\n",
139 | " loss = keytoken_weighted_loss(input_ids, outputs.logits, keytoken_ids)\n",
140 | "\n",
141 | " return (loss, outputs) if return_outputs else loss"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": []
150 | }
151 | ],
152 | "metadata": {
153 | "colab": {
154 | "name": "Using a custom loss function",
155 | "provenance": []
156 | }
157 | },
158 | "nbformat": 4,
159 | "nbformat_minor": 4
160 | }
161 |
--------------------------------------------------------------------------------
/course/chapter3/section3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Fine-tuning a model with the Trainer API"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "from datasets import load_dataset\n",
33 | "from transformers import AutoTokenizer, DataCollatorWithPadding\n",
34 | "\n",
35 | "raw_datasets = load_dataset(\"glue\", \"mrpc\")\n",
36 | "checkpoint = \"bert-base-uncased\"\n",
37 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
38 | "\n",
39 | "\n",
40 | "def tokenize_function(example):\n",
41 | " return tokenizer(example[\"sentence1\"], example[\"sentence2\"], truncation=True)\n",
42 | "\n",
43 | "\n",
44 | "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n",
45 | "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "from transformers import TrainingArguments\n",
55 | "\n",
56 | "training_args = TrainingArguments(\"test-trainer\")"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "from transformers import AutoModelForSequenceClassification\n",
66 | "\n",
67 | "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "from transformers import Trainer\n",
77 | "\n",
78 | "trainer = Trainer(\n",
79 | " model,\n",
80 | " training_args,\n",
81 | " train_dataset=tokenized_datasets[\"train\"],\n",
82 | " eval_dataset=tokenized_datasets[\"validation\"],\n",
83 | " data_collator=data_collator,\n",
84 | " tokenizer=tokenizer,\n",
85 | ")"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "trainer.train()"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [
102 | {
103 | "data": {
104 | "text/plain": [
105 | "(408, 2) (408,)"
106 | ]
107 | },
108 | "execution_count": null,
109 | "metadata": {},
110 | "output_type": "execute_result"
111 | }
112 | ],
113 | "source": [
114 | "predictions = trainer.predict(tokenized_datasets[\"validation\"])\n",
115 | "print(predictions.predictions.shape, predictions.label_ids.shape)"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "import numpy as np\n",
125 | "\n",
126 | "preds = np.argmax(predictions.predictions, axis=-1)"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {},
133 | "outputs": [
134 | {
135 | "data": {
136 | "text/plain": [
137 | "{'accuracy': 0.8578431372549019, 'f1': 0.8996539792387542}"
138 | ]
139 | },
140 | "execution_count": null,
141 | "metadata": {},
142 | "output_type": "execute_result"
143 | }
144 | ],
145 | "source": [
146 | "from datasets import load_metric\n",
147 | "\n",
148 | "metric = load_metric(\"glue\", \"mrpc\")\n",
149 | "metric.compute(predictions=preds, references=predictions.label_ids)"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {},
156 | "outputs": [],
157 | "source": [
158 | "def compute_metrics(eval_preds):\n",
159 | " metric = load_metric(\"glue\", \"mrpc\")\n",
160 | " logits, labels = eval_preds\n",
161 | " predictions = np.argmax(logits, axis=-1)\n",
162 | " return metric.compute(predictions=predictions, references=labels)"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "training_args = TrainingArguments(\"test-trainer\", evaluation_strategy=\"epoch\")\n",
172 | "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
173 | "\n",
174 | "trainer = Trainer(\n",
175 | " model,\n",
176 | " training_args,\n",
177 | " train_dataset=tokenized_datasets[\"train\"],\n",
178 | " eval_dataset=tokenized_datasets[\"validation\"],\n",
179 | " data_collator=data_collator,\n",
180 | " tokenizer=tokenizer,\n",
181 | " compute_metrics=compute_metrics,\n",
182 | ")"
183 | ]
184 | }
185 | ],
186 | "metadata": {
187 | "colab": {
188 | "name": "Fine-tuning a model with the Trainer API",
189 | "provenance": []
190 | }
191 | },
192 | "nbformat": 4,
193 | "nbformat_minor": 4
194 | }
195 |
--------------------------------------------------------------------------------
/sagemaker/14_train_and_push_to_hub/scripts/train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import os
4 | import random
5 | import sys
6 |
7 | import numpy as np
8 | import torch
9 | from datasets import load_from_disk, load_metric
10 | from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
11 | from transformers.trainer_utils import get_last_checkpoint
12 |
13 | if __name__ == "__main__":
14 |
15 | parser = argparse.ArgumentParser()
16 |
17 | # hyperparameters sent by the client are passed as command-line arguments to the script.
18 | parser.add_argument("--epochs", type=int, default=3)
19 | parser.add_argument("--train_batch_size", type=int, default=32)
20 | parser.add_argument("--eval_batch_size", type=int, default=64)
21 | parser.add_argument("--warmup_steps", type=int, default=500)
22 | parser.add_argument("--model_id", type=str)
23 | parser.add_argument("--learning_rate", type=str, default=5e-5)
24 | parser.add_argument("--fp16", type=bool, default=True)
25 |
26 | # Push to Hub Parameters
27 | parser.add_argument("--push_to_hub", type=bool, default=True)
28 | parser.add_argument("--hub_model_id", type=str, default=None)
29 | parser.add_argument("--hub_strategy", type=str, default=None)
30 | parser.add_argument("--hub_token", type=str, default=None)
31 |
32 | # Data, model, and output directories
33 | parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
34 | parser.add_argument("--output_dir", type=str, default=os.environ["SM_MODEL_DIR"])
35 | parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
36 | parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
37 | parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
38 |
39 | args, _ = parser.parse_known_args()
40 |
41 | # make sure we have required parameters to push
42 | if args.push_to_hub:
43 | if args.hub_strategy is None:
44 | raise ValueError("--hub_strategy is required when pushing to Hub")
45 | if args.hub_token is None:
46 | raise ValueError("--hub_token is required when pushing to Hub")
47 |
48 | # sets hub id if not provided
49 | if args.hub_model_id is None:
50 | args.hub_model_id = args.model_id.replace("/", "--")
51 |
52 | # Set up logging
53 | logger = logging.getLogger(__name__)
54 |
55 | logging.basicConfig(
56 | level=logging.getLevelName("INFO"),
57 | handlers=[logging.StreamHandler(sys.stdout)],
58 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
59 | )
60 |
61 | # load datasets
62 | train_dataset = load_from_disk(args.training_dir)
63 | test_dataset = load_from_disk(args.test_dir)
64 |
65 | logger.info(f" loaded train_dataset length is: {len(train_dataset)}")
66 | logger.info(f" loaded test_dataset length is: {len(test_dataset)}")
67 |
68 | # define metrics and metrics function
69 | metric = load_metric("accuracy")
70 |
71 | def compute_metrics(eval_pred):
72 | predictions, labels = eval_pred
73 | predictions = np.argmax(predictions, axis=1)
74 | return metric.compute(predictions=predictions, references=labels)
75 |
76 | # Prepare model labels - useful in inference API
77 | labels = train_dataset.features["labels"].names
78 | num_labels = len(labels)
79 | label2id, id2label = dict(), dict()
80 | for i, label in enumerate(labels):
81 | label2id[label] = str(i)
82 | id2label[str(i)] = label
83 |
84 | # download model from model hub
85 | model = AutoModelForSequenceClassification.from_pretrained(
86 | args.model_id, num_labels=num_labels, label2id=label2id, id2label=id2label
87 | )
88 | tokenizer = AutoTokenizer.from_pretrained(args.model_id)
89 |
90 | # define training args
91 | training_args = TrainingArguments(
92 | output_dir=args.output_dir,
93 | overwrite_output_dir=True if get_last_checkpoint(args.output_dir) is not None else False,
94 | num_train_epochs=args.epochs,
95 | per_device_train_batch_size=args.train_batch_size,
96 | per_device_eval_batch_size=args.eval_batch_size,
97 | warmup_steps=args.warmup_steps,
98 | fp16=args.fp16,
99 | evaluation_strategy="epoch",
100 | save_strategy="epoch",
101 | save_total_limit=2,
102 | logging_dir=f"{args.output_data_dir}/logs",
103 | learning_rate=float(args.learning_rate),
104 | load_best_model_at_end=True,
105 | metric_for_best_model="accuracy",
106 | # push to hub parameters
107 | push_to_hub=args.push_to_hub,
108 | hub_strategy=args.hub_strategy,
109 | hub_model_id=args.hub_model_id,
110 | hub_token=args.hub_token,
111 | )
112 |
113 | # create Trainer instance
114 | trainer = Trainer(
115 | model=model,
116 | args=training_args,
117 | compute_metrics=compute_metrics,
118 | train_dataset=train_dataset,
119 | eval_dataset=test_dataset,
120 | tokenizer=tokenizer,
121 | )
122 |
123 | # train model
124 | trainer.train()
125 |
126 | # evaluate model
127 | eval_result = trainer.evaluate(eval_dataset=test_dataset)
128 |
129 | # save best model, metrics and create model card
130 | trainer.create_model_card(model_name=args.hub_model_id)
131 | trainer.push_to_hub()
132 |
133 | # Save the model to S3; use os.environ["SM_MODEL_DIR"] to make sure checkpointing works
134 | trainer.save_model(os.environ["SM_MODEL_DIR"])
135 |
--------------------------------------------------------------------------------
/course/videos/token_processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_dataset\n",
61 | "\n",
62 | "raw_datasets = load_dataset(\"conll2003\")\n",
63 | "raw_datasets = raw_datasets.remove_columns([\"chunk_tags\", \"id\", \"pos_tags\"])\n",
64 | "raw_datasets = raw_datasets.rename_column(\"ner_tags\", \"labels\")\n",
65 | "raw_datasets = raw_datasets.rename_column(\"tokens\", \"words\")\n",
66 | "raw_datasets[\"train\"]"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "print(raw_datasets[\"train\"][0][\"words\"])\n",
76 | "print(raw_datasets[\"train\"][0][\"labels\"])"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "label_names = raw_datasets[\"train\"].features[\"labels\"].feature.names\n",
86 | "label_names"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "from transformers import AutoTokenizer\n",
96 | "\n",
97 | "model_checkpoint = \"bert-base-cased\"\n",
98 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
99 | "\n",
100 | "inputs = tokenizer(raw_datasets[\"train\"][0][\"words\"], is_split_into_words=True)\n",
101 | "inputs.tokens()"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "def shift_label(label):\n",
111 | " # If the label is B-XXX we change it to I-XXX\n",
112 | " if label % 2 == 1:\n",
113 | " label += 1\n",
114 | " return label\n",
115 | "\n",
116 | "def align_labels_with_tokens(labels, word_ids):\n",
117 | " new_labels = []\n",
118 | " current_word = None\n",
119 | " for word_id in word_ids:\n",
120 | " if word_id is None:\n",
121 | " new_labels.append(-100)\n",
122 | " elif word_id != current_word:\n",
123 | " # Start of a new word!\n",
124 | " current_word = word_id\n",
125 | " new_labels.append(labels[word_id])\n",
126 | " else:\n",
127 | " new_labels.append(shift_label(labels[word_id]))\n",
128 | "\n",
129 | " return new_labels"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "def tokenize_and_align_labels(examples):\n",
139 | " tokenized_inputs = tokenizer(examples[\"words\"], truncation=True, is_split_into_words=True)\n",
140 | " new_labels = []\n",
141 | " for i, labels in enumerate(examples[\"labels\"]):\n",
142 | " word_ids = tokenized_inputs.word_ids(i)\n",
143 | " new_labels.append(align_labels_with_tokens(labels, word_ids))\n",
144 | "\n",
145 | " tokenized_inputs[\"labels\"] = new_labels\n",
146 | " return tokenized_inputs\n",
147 | "\n",
148 | "tokenized_datasets = raw_datasets.map(tokenize_and_align_labels, batched=True)"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "from transformers import DataCollatorForTokenClassification\n",
158 | "\n",
159 | "data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": null,
165 | "metadata": {},
166 | "outputs": [],
167 | "source": []
168 | }
169 | ],
170 | "metadata": {
171 | "colab": {
172 | "name": "Data processing for Token Classification",
173 | "provenance": []
174 | }
175 | },
176 | "nbformat": 4,
177 | "nbformat_minor": 4
178 | }
179 |
--------------------------------------------------------------------------------
/course/videos/translation_processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_dataset, load_metric\n",
61 | "\n",
62 | "raw_datasets = load_dataset(\"kde4\", lang1=\"en\", lang2=\"fr\")\n",
63 | "\n",
64 | "def extract_languages(examples):\n",
65 | " inputs = [ex[\"en\"] for ex in examples[\"translation\"]]\n",
66 | " targets = [ex[\"fr\"] for ex in examples[\"translation\"]]\n",
67 | " return {\"inputs\": inputs, \"targets\": targets}\n",
68 | "\n",
69 | "raw_datasets = raw_datasets.map(extract_languages, batched=True, remove_columns=[\"id\", \"translation\"])\n",
70 | "raw_datasets"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "print(raw_datasets[\"train\"][10])\n",
80 | "print(raw_datasets[\"train\"][11])\n",
81 | "print(raw_datasets[\"train\"][12])"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "from transformers import AutoTokenizer\n",
91 | "\n",
92 | "model_checkpoint = \"Helsinki-NLP/opus-mt-en-fr\"\n",
93 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
94 | "\n",
95 | "sample = raw_datasets[\"train\"][12]\n",
96 | "inputs = tokenizer(sample[\"inputs\"])\n",
97 | "targets = tokenizer(sample[\"targets\"])\n",
98 | "\n",
99 | "\n",
100 | "print(tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"]))\n",
101 | "print(tokenizer.convert_ids_to_tokens(targets[\"input_ids\"]))"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "from transformers import AutoTokenizer\n",
111 | "\n",
112 | "model_checkpoint = \"Helsinki-NLP/opus-mt-en-fr\"\n",
113 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
114 | "\n",
115 | "sample = raw_datasets[\"train\"][12]\n",
116 | "inputs = tokenizer(sample[\"inputs\"])\n",
117 | "with tokenizer.as_target_tokenizer():\n",
118 | " targets = tokenizer(sample[\"targets\"])\n",
119 | "\n",
120 | "\n",
121 | "print(tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"]))\n",
122 | "print(tokenizer.convert_ids_to_tokens(targets[\"input_ids\"]))"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "max_input_length = 128\n",
132 | "max_target_length = 128\n",
133 | "\n",
134 | "def preprocess_function(examples):\n",
135 | " model_inputs = tokenizer(examples[\"inputs\"], max_length=max_input_length, truncation=True)\n",
136 | "\n",
137 | " # Setup the tokenizer for targets\n",
138 | " with tokenizer.as_target_tokenizer():\n",
139 | " labels = tokenizer(examples[\"targets\"], max_length=max_target_length, truncation=True)\n",
140 | "\n",
141 | " model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
142 | " return model_inputs\n",
143 | "\n",
144 | "tokenized_datasets = raw_datasets.map(\n",
145 | " preprocess_function, batched=True, remove_columns=[\"inputs\", \"targets\"]\n",
146 | ")"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "from transformers import DataCollatorForSeq2Seq\n",
156 | "\n",
157 | "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": []
166 | }
167 | ],
168 | "metadata": {
169 | "colab": {
170 | "name": "Data processing for Translation",
171 | "provenance": []
172 | }
173 | },
174 | "nbformat": 4,
175 | "nbformat_minor": 4
176 | }
177 |
--------------------------------------------------------------------------------
/sagemaker/09_image_classification_vision_transformer/scripts/train.py:
--------------------------------------------------------------------------------
1 | from transformers import ViTForImageClassification, Trainer, TrainingArguments, default_data_collator, ViTFeatureExtractor
2 | from datasets import load_from_disk, load_metric
3 | import random
4 | import logging
5 | import sys
6 | import argparse
7 | import os
8 | import numpy as np
9 | import subprocess
10 |
11 | subprocess.run([
12 | "git",
13 | "config",
14 | "--global",
15 | "user.email",
16 | "sagemaker@huggingface.co",
17 | ], check=True)
18 | subprocess.run([
19 | "git",
20 | "config",
21 | "--global",
22 | "user.name",
23 | "sagemaker",
24 | ], check=True)
25 |
26 |
27 | if __name__ == "__main__":
28 |
29 | parser = argparse.ArgumentParser()
30 |
31 | # hyperparameters sent by the client are passed as command-line arguments to the script.
32 | parser.add_argument("--model_name", type=str)
33 | parser.add_argument("--output_dir", type=str,default="/opt/ml/model")
34 | parser.add_argument("--extra_model_name", type=str,default="sagemaker")
35 | parser.add_argument("--dataset", type=str,default="cifar10")
36 | parser.add_argument("--task", type=str,default="image-classification")
37 | parser.add_argument("--use_auth_token", type=str, default="")
38 |
39 | parser.add_argument("--num_train_epochs", type=int, default=3)
40 | parser.add_argument("--per_device_train_batch_size", type=int, default=32)
41 | parser.add_argument("--per_device_eval_batch_size", type=int, default=64)
42 | parser.add_argument("--warmup_steps", type=int, default=500)
43 | parser.add_argument("--weight_decay", type=float, default=0.01)
44 | parser.add_argument("--learning_rate", type=str, default=2e-5)
45 |
46 | parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
47 | parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
48 |
49 | args, _ = parser.parse_known_args()
50 |
51 | # Set up logging
52 | logger = logging.getLogger(__name__)
53 |
54 | logging.basicConfig(
55 | level=logging.getLevelName("INFO"),
56 | handlers=[logging.StreamHandler(sys.stdout)],
57 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
58 | )
59 |
60 | # load datasets
61 | train_dataset = load_from_disk(args.training_dir)
62 | test_dataset = load_from_disk(args.test_dir)
63 | num_classes = train_dataset.features["label"].num_classes
64 |
65 |
66 | logger.info(f" loaded train_dataset length is: {len(train_dataset)}")
67 | logger.info(f" loaded test_dataset length is: {len(test_dataset)}")
68 |
69 | metric_name = "accuracy"
70 | # compute metrics function for multi-class image classification
71 |
72 | metric = load_metric(metric_name)
73 |
74 | def compute_metrics(eval_pred):
75 | predictions, labels = eval_pred
76 | predictions = np.argmax(predictions, axis=1)
77 | return metric.compute(predictions=predictions, references=labels)
78 |
79 | # download model from model hub
80 | model = ViTForImageClassification.from_pretrained(args.model_name, num_labels=num_classes)
81 |
82 | # remap the model config's generic labels to the dataset's class names
83 | id2label = {key: train_dataset.features["label"].names[index] for index, key in enumerate(model.config.id2label.keys())}
84 | label2id = {train_dataset.features["label"].names[index]: value for index, value in enumerate(model.config.label2id.values())}
85 | model.config.id2label = id2label
86 | model.config.label2id = label2id
87 |
88 |
89 | # define training args
90 | training_args = TrainingArguments(
91 | output_dir=args.output_dir,
92 | num_train_epochs=args.num_train_epochs,
93 | per_device_train_batch_size=args.per_device_train_batch_size,
94 | per_device_eval_batch_size=args.per_device_eval_batch_size,
95 | warmup_steps=args.warmup_steps,
96 | weight_decay=args.weight_decay,
97 | evaluation_strategy="epoch",
98 | logging_dir=f"{args.output_dir}/logs",
99 | learning_rate=float(args.learning_rate),
100 | load_best_model_at_end=True,
101 | metric_for_best_model=metric_name,
102 | )
103 |
104 |
105 | # create Trainer instance
106 | trainer = Trainer(
107 | model=model,
108 | args=training_args,
109 | compute_metrics=compute_metrics,
110 | train_dataset=train_dataset,
111 | eval_dataset=test_dataset,
112 | data_collator=default_data_collator,
113 | )
114 |
115 | # train model
116 | trainer.train()
117 |
118 | # evaluate model
119 | eval_result = trainer.evaluate(eval_dataset=test_dataset)
120 |
121 | # writes eval results to a file that can be accessed later in the S3 output
122 | with open(os.path.join(args.output_dir, "eval_results.txt"), "w") as writer:
123 | print("***** Eval results *****")
124 | for key, value in sorted(eval_result.items()):
125 | writer.write(f"{key} = {value}\n")
126 |
127 | # Saves the model to s3
128 | trainer.save_model(args.output_dir)
129 |
130 | if args.use_auth_token != "":
131 | kwargs = {
132 | "finetuned_from": args.model_name.split("/")[1],
133 | "tags": "image-classification",
134 | "dataset": args.dataset,
135 | }
136 | repo_name = (
137 | f"{args.model_name.split('/')[1]}-{args.task}"
138 | if args.extra_model_name == ""
139 | else f"{args.model_name.split('/')[1]}-{args.task}-{args.extra_model_name}"
140 | )
141 |
142 | trainer.push_to_hub(
143 | repo_name=repo_name,
144 | use_auth_token=args.use_auth_token,
145 | **kwargs,
146 | )
147 |
--------------------------------------------------------------------------------
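The training script above takes its hyperparameters as command-line arguments and reads its data from the `SM_CHANNEL_TRAIN` / `SM_CHANNEL_TEST` input channels. As a rough sketch (not taken from this repository), a script of this shape is typically submitted through the SageMaker Hugging Face estimator along these lines; the checkpoint name, role ARN, S3 URIs, instance type and framework versions are placeholders.

```python
from sagemaker.huggingface import HuggingFace

# Hyperparameters end up as command-line arguments of train.py.
hyperparameters = {
    "model_name": "google/vit-base-patch16-224-in21k",  # placeholder checkpoint
    "num_train_epochs": 3,
    "per_device_train_batch_size": 32,
    "learning_rate": 2e-5,
}

huggingface_estimator = HuggingFace(
    entry_point="train.py",
    source_dir="./scripts",
    instance_type="ml.p3.2xlarge",          # placeholder instance type
    instance_count=1,
    role="<sagemaker-execution-role-arn>",   # placeholder IAM role
    transformers_version="4.6",              # placeholder framework versions
    pytorch_version="1.7",
    py_version="py36",
    hyperparameters=hyperparameters,
)

# The channel names map to SM_CHANNEL_TRAIN / SM_CHANNEL_TEST inside the job,
# which the script picks up through its --training_dir / --test_dir defaults.
huggingface_estimator.fit({"train": "s3://<bucket>/train", "test": "s3://<bucket>/test"})
```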
/course/videos/sentence_pairs_tf.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from transformers import AutoTokenizer\n",
61 | "\n",
62 | "checkpoint = \"bert-base-uncased\"\n",
63 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
64 | "sequences = [\n",
65 | " \"I've been waiting for a HuggingFace course my whole life.\",\n",
66 | " \"This course is amazing!\",\n",
67 | "]\n",
68 | "batch = tokenizer(sequences, padding=True, truncation=True, return_tensors=\"tf\")"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "metadata": {},
75 | "outputs": [
76 | {
77 | "data": {
78 | "text/plain": [
79 | "{'input_ids': [101, 2026, 2171, 2003, 25353, 22144, 2378, 1012, 102, 1045, 2147, 2012, 17662, 2227, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
80 | ]
81 | },
82 | "execution_count": null,
83 | "metadata": {},
84 | "output_type": "execute_result"
85 | }
86 | ],
87 | "source": [
88 | "from transformers import AutoTokenizer\n",
89 | "\n",
90 | "checkpoint = \"bert-base-uncased\"\n",
91 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
92 | "tokenizer(\"My name is Sylvain.\", \"I work at Hugging Face.\")"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {},
99 | "outputs": [
100 | {
101 | "data": {
102 | "text/plain": [
103 | "{'input_ids': [[101, 2026, 2171, 2003, 25353, 22144, 2378, 1012, 102, 1045, 2147, 2012, 17662, 2227, 1012, 102], [101, 2183, 2000, 1996, 5988, 1012, 102, 2023, 3185, 2003, 2307, 1012, 102, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]}"
104 | ]
105 | },
106 | "execution_count": null,
107 | "metadata": {},
108 | "output_type": "execute_result"
109 | }
110 | ],
111 | "source": [
112 | "from transformers import AutoTokenizer\n",
113 | "\n",
114 | "checkpoint = \"bert-base-uncased\"\n",
115 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
116 | "tokenizer(\n",
117 | " [\"My name is Sylvain.\", \"Going to the cinema.\"],\n",
118 | " [\"I work at Hugging Face.\", \"This movie is great.\"],\n",
119 | " padding=True\n",
120 | ")"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "metadata": {},
127 | "outputs": [
128 | {
129 | "name": "stderr",
130 | "output_type": "stream",
131 | "text": [
132 | "All model checkpoint layers were used when initializing TFBertForSequenceClassification.\n",
133 | "\n",
134 | "Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']\n",
135 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
136 | ]
137 | }
138 | ],
139 | "source": [
140 | "from transformers import TFAutoModelForSequenceClassification, AutoTokenizer\n",
141 | "\n",
142 | "checkpoint = \"bert-base-uncased\"\n",
143 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
144 | "batch = tokenizer(\n",
145 | " [\"My name is Sylvain.\", \"Going to the cinema.\"],\n",
146 | " [\"I work at Hugging Face.\", \"This movie is great.\"],\n",
147 | " padding=True,\n",
148 | " return_tensors=\"tf\",\n",
149 | ")\n",
150 | "\n",
151 | "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
152 | "outputs = model(**batch)"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {},
159 | "outputs": [],
160 | "source": []
161 | }
162 | ],
163 | "metadata": {
164 | "colab": {
165 | "name": "Preprocessing sentence pairs (TensorFlow)",
166 | "provenance": []
167 | }
168 | },
169 | "nbformat": 4,
170 | "nbformat_minor": 4
171 | }
172 |
--------------------------------------------------------------------------------
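The notebook above prints the `token_type_ids` a BERT tokenizer produces for a sentence pair. A small sketch, assuming the same `bert-base-uncased` checkpoint, makes the segment layout explicit: the ids are 0 for `[CLS]` plus the first sentence and its `[SEP]`, and 1 for the second sentence and the final `[SEP]`.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
encoding = tokenizer("My name is Sylvain.", "I work at Hugging Face.")

# Print each token next to its segment id to visualize the pair layout.
for token, type_id in zip(
    tokenizer.convert_ids_to_tokens(encoding["input_ids"]), encoding["token_type_ids"]
):
    print(f"{token:>12}  segment {type_id}")
```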
/course/videos/tensorflow_finetuning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {
65 | "cellView": "form"
66 | },
67 | "outputs": [
68 | {
69 | "data": {
70 | "text/html": [
71 | ""
72 | ],
73 | "text/plain": [
74 | ""
75 | ]
76 | },
77 | "execution_count": null,
78 | "metadata": {},
79 | "output_type": "execute_result"
80 | }
81 | ],
82 | "source": [
83 | "#@title\n",
84 | "from IPython.display import HTML\n",
85 | "\n",
86 | "HTML('')"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "Install the Transformers and Datasets libraries to run this notebook."
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "! pip install datasets transformers[sentencepiece]"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "from datasets import load_dataset\n",
112 | "from transformers import AutoTokenizer\n",
113 | "import numpy as np\n",
114 | "\n",
115 | "raw_datasets = load_dataset(\"glue\", \"mrpc\")\n",
116 | "checkpoint = \"bert-base-uncased\"\n",
117 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
118 | "\n",
119 | "def tokenize_dataset(dataset):\n",
120 | " encoded = tokenizer(\n",
121 | " dataset[\"sentence1\"],\n",
122 | " dataset[\"sentence2\"],\n",
123 | " max_length=128,\n",
124 | " truncation=True,\n",
125 | " )\n",
126 | " return encoded.data\n",
127 | "\n",
128 | "tokenized_datasets = raw_datasets.map(tokenize_dataset, batched=True)\n",
129 | "\n",
130 | "train_dataset = tokenized_datasets[\"train\"].to_tf_dataset(\n",
131 | " columns=[\"input_ids\", \"attention_mask\", \"token_type_ids\"],\n",
132 | " label_cols=[\"label\"],\n",
133 | " shuffle=True,\n",
134 | " batch_size=8)\n",
135 | "\n",
136 | "validation_dataset = tokenized_datasets[\"validation\"].to_tf_dataset(\n",
137 | " columns=[\"input_ids\", \"attention_mask\", \"token_type_ids\"],\n",
138 | " label_cols=[\"label\"],\n",
139 | " shuffle=True,\n",
140 | " batch_size=8)"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "next(iter(train_dataset))[1]"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {},
156 | "outputs": [],
157 | "source": [
158 | "import tensorflow as tf\n",
159 | "from transformers import TFAutoModelForSequenceClassification\n",
160 | "\n",
161 | "checkpoint = 'bert-base-cased'\n",
162 | "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
163 | "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n",
164 | "model.compile(optimizer='adam', loss=loss)"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": null,
170 | "metadata": {},
171 | "outputs": [],
172 | "source": [
173 | "model.fit(\n",
174 | " train_dataset,\n",
175 | " validation_data=validation_dataset,\n",
176 | " epochs=3\n",
177 | ")"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": []
186 | }
187 | ],
188 | "metadata": {
189 | "colab": {
190 | "name": "Fine-Tuning with TensorFlow",
191 | "provenance": []
192 | }
193 | },
194 | "nbformat": 4,
195 | "nbformat_minor": 4
196 | }
197 |
--------------------------------------------------------------------------------
/course/chapter3/section3_tf.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Fine-tuning a model with Keras"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "from datasets import load_dataset\n",
33 | "from transformers import AutoTokenizer, DataCollatorWithPadding\n",
34 | "import numpy as np\n",
35 | "\n",
36 | "raw_datasets = load_dataset(\"glue\", \"mrpc\")\n",
37 | "checkpoint = \"bert-base-uncased\"\n",
38 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
39 | "\n",
40 | "\n",
41 | "def tokenize_function(example):\n",
42 | " return tokenizer(example[\"sentence1\"], example[\"sentence2\"], truncation=True)\n",
43 | "\n",
44 | "\n",
45 | "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n",
46 | "\n",
47 | "data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors=\"tf\")\n",
48 | "\n",
49 | "tf_train_dataset = tokenized_datasets[\"train\"].to_tf_dataset(\n",
50 | " columns=[\"attention_mask\", \"input_ids\", \"token_type_ids\"],\n",
51 | " label_cols=[\"labels\"],\n",
52 | " shuffle=True,\n",
53 | " collate_fn=data_collator,\n",
54 | " batch_size=8,\n",
55 | ")\n",
56 | "\n",
57 | "tf_validation_dataset = tokenized_datasets[\"validation\"].to_tf_dataset(\n",
58 | " columns=[\"attention_mask\", \"input_ids\", \"token_type_ids\"],\n",
59 | " label_cols=[\"labels\"],\n",
60 | " shuffle=False,\n",
61 | " collate_fn=data_collator,\n",
62 | " batch_size=8,\n",
63 | ")"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "from transformers import TFAutoModelForSequenceClassification\n",
73 | "\n",
74 | "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "from tensorflow.keras.losses import SparseCategoricalCrossentropy\n",
84 | "\n",
85 | "model.compile(\n",
86 | " optimizer=\"adam\",\n",
87 | " loss=SparseCategoricalCrossentropy(from_logits=True),\n",
88 | " metrics=[\"accuracy\"],\n",
89 | ")\n",
90 | "model.fit(\n",
91 | " tf_train_dataset,\n",
92 | " validation_data=tf_validation_dataset,\n",
93 | ")"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "from tensorflow.keras.optimizers.schedules import PolynomialDecay\n",
103 | "\n",
104 | "batch_size = 8\n",
105 | "num_epochs = 3\n",
106 | "# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied\n",
107 | "# by the total number of epochs\n",
108 | "num_train_steps = len(tf_train_dataset) * num_epochs\n",
109 | "lr_scheduler = PolynomialDecay(\n",
110 | " initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps\n",
111 | ")\n",
112 | "from tensorflow.keras.optimizers import Adam\n",
113 | "\n",
114 | "opt = Adam(learning_rate=lr_scheduler)"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {},
121 | "outputs": [],
122 | "source": [
123 | "import tensorflow as tf\n",
124 | "\n",
125 | "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
126 | "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n",
127 | "model.compile(optimizer=opt, loss=loss, metrics=[\"accuracy\"])"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": null,
133 | "metadata": {},
134 | "outputs": [],
135 | "source": [
136 | "model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3)"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "preds = model.predict(tf_validation_dataset)[\"logits\"]"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [
153 | {
154 | "data": {
155 | "text/plain": [
156 | "(408, 2) (408,)"
157 | ]
158 | },
159 | "execution_count": null,
160 | "metadata": {},
161 | "output_type": "execute_result"
162 | }
163 | ],
164 | "source": [
165 | "class_preds = np.argmax(preds, axis=1)\n",
166 | "print(preds.shape, class_preds.shape)"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {},
173 | "outputs": [
174 | {
175 | "data": {
176 | "text/plain": [
177 | "{'accuracy': 0.8578431372549019, 'f1': 0.8996539792387542}"
178 | ]
179 | },
180 | "execution_count": null,
181 | "metadata": {},
182 | "output_type": "execute_result"
183 | }
184 | ],
185 | "source": [
186 | "from datasets import load_metric\n",
187 | "\n",
188 | "metric = load_metric(\"glue\", \"mrpc\")\n",
189 | "metric.compute(predictions=class_preds, references=raw_datasets[\"validation\"][\"label\"])"
190 | ]
191 | }
192 | ],
193 | "metadata": {
194 | "colab": {
195 | "name": "Fine-tuning a model with Keras",
196 | "provenance": []
197 | }
198 | },
199 | "nbformat": 4,
200 | "nbformat_minor": 4
201 | }
202 |
--------------------------------------------------------------------------------
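The notebook above computes `num_train_steps` as `len(tf_train_dataset) * num_epochs`, where `len(tf_train_dataset)` is already the number of batches per epoch. A quick arithmetic sketch under the notebook's own settings (MRPC's 3,668 training pairs, batch size 8, 3 epochs), also showing that the `PolynomialDecay` schedule is a callable that can be inspected step by step:

```python
import tensorflow as tf

# 3,668 examples / batch_size 8 -> 459 batches per epoch; 3 epochs -> 1,377 decay steps.
num_train_steps = 459 * 3

lr_scheduler = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)

# The default power=1.0 makes this a linear decay from 5e-5 down to 0.
for step in (0, num_train_steps // 2, num_train_steps):
    print(step, float(lr_scheduler(step)))
```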
/course/videos/semantic_search.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "import torch\n",
61 | "from transformers import AutoTokenizer, AutoModel\n",
62 | "\n",
63 | "sentences = [\n",
64 | " \"I took my dog for a walk\",\n",
65 | " \"Today is going to rain\",\n",
66 | " \"I took my cat for a walk\",\n",
67 | "]\n",
68 | "\n",
69 | "model_ckpt = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
70 | "tokenizer = AutoTokenizer.from_pretrained(model_ckpt)\n",
71 | "model = AutoModel.from_pretrained(model_ckpt)\n",
72 | "\n",
73 | "encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors=\"pt\")\n",
74 | "\n",
75 | "with torch.no_grad():\n",
76 | " model_output = model(**encoded_input)\n",
77 | " \n",
78 | " \n",
79 | "token_embeddings = model_output.last_hidden_state\n",
80 | "print(f\"Token embeddings shape: {token_embeddings.size()}\")"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "import torch.nn.functional as F\n",
90 | "\n",
91 | "\n",
92 | "def mean_pooling(model_output, attention_mask):\n",
93 | " token_embeddings = model_output.last_hidden_state\n",
94 | " input_mask_expanded = (\n",
95 | " attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()\n",
96 | " )\n",
97 | " return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(\n",
98 | " input_mask_expanded.sum(1), min=1e-9\n",
99 | " )\n",
100 | "\n",
101 | "\n",
102 | "sentence_embeddings = mean_pooling(model_output, encoded_input[\"attention_mask\"])\n",
103 | "# Normalize the embeddings\n",
104 | "sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)\n",
105 | "print(f\"Sentence embeddings shape: {sentence_embeddings.size()}\")"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "import numpy as np\n",
115 | "from sklearn.metrics.pairwise import cosine_similarity\n",
116 | "\n",
117 | "sentence_embeddings = sentence_embeddings.detach().numpy()\n",
118 | "\n",
119 | "scores = np.zeros((sentence_embeddings.shape[0], sentence_embeddings.shape[0]))\n",
120 | "\n",
121 | "for idx in range(sentence_embeddings.shape[0]):\n",
122 | " scores[idx, :] = cosine_similarity([sentence_embeddings[idx]], sentence_embeddings)[0]"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "from datasets import load_dataset\n",
132 | "\n",
133 | "squad = load_dataset(\"squad\", split=\"validation\").shuffle(seed=42).select(range(100))\n",
134 | "\n",
135 | "\n",
136 | "def get_embeddings(text_list):\n",
137 | " encoded_input = tokenizer(\n",
138 | " text_list, padding=True, truncation=True, return_tensors=\"pt\"\n",
139 | " )\n",
140 | " encoded_input = {k: v for k, v in encoded_input.items()}\n",
141 | " with torch.no_grad():\n",
142 | " model_output = model(**encoded_input)\n",
143 | " return mean_pooling(model_output, encoded_input[\"attention_mask\"])\n",
144 | "\n",
145 | "\n",
146 | "squad_with_embeddings = squad.map(\n",
147 | " lambda x: {\"embeddings\": get_embeddings(x[\"context\"]).cpu().numpy()[0]}\n",
148 | ")"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "squad_with_embeddings.add_faiss_index(column=\"embeddings\")\n",
158 | "\n",
159 | "question = \"Who headlined the halftime show for Super Bowl 50?\"\n",
160 | "question_embedding = get_embeddings([question]).cpu().detach().numpy()\n",
161 | "\n",
162 | "scores, samples = squad_with_embeddings.get_nearest_examples(\n",
163 | " \"embeddings\", question_embedding, k=3\n",
164 | ")"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": null,
170 | "metadata": {},
171 | "outputs": [],
172 | "source": []
173 | }
174 | ],
175 | "metadata": {
176 | "colab": {
177 | "name": "Text embeddings & semantic search",
178 | "provenance": []
179 | }
180 | },
181 | "nbformat": 4,
182 | "nbformat_minor": 4
183 | }
184 |
--------------------------------------------------------------------------------
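The last code cell of the notebook above retrieves the nearest examples but leaves them unprinted. A minimal sketch, assuming the `scores` and `samples` variables it produces: `samples` is a dict with one list per dataset column and `scores` holds the FAISS distances, so the two can simply be zipped.

```python
# Display the retrieved contexts alongside their FAISS scores.
for score, title, context in zip(scores, samples["title"], samples["context"]):
    print(f"score: {score:.3f} | {title}")
    print(context[:200])
    print()
```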
/course/chapter2/section6_pt.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Putting it all together (PyTorch)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "from transformers import AutoTokenizer\n",
33 | "\n",
34 | "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
35 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
36 | "\n",
37 | "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n",
38 | "\n",
39 | "model_inputs = tokenizer(sequence)"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n",
49 | "\n",
50 | "model_inputs = tokenizer(sequence)"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
60 | "\n",
61 | "model_inputs = tokenizer(sequences)"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "# Will pad the sequences up to the maximum sequence length\n",
71 | "model_inputs = tokenizer(sequences, padding=\"longest\")\n",
72 | "\n",
73 | "# Will pad the sequences up to the model max length\n",
74 | "# (512 for BERT or DistilBERT)\n",
75 | "model_inputs = tokenizer(sequences, padding=\"max_length\")\n",
76 | "\n",
77 | "# Will pad the sequences up to the specified max length\n",
78 | "model_inputs = tokenizer(sequences, padding=\"max_length\", max_length=8)"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
88 | "\n",
89 | "# Will truncate the sequences that are longer than the model max length\n",
90 | "# (512 for BERT or DistilBERT)\n",
91 | "model_inputs = tokenizer(sequences, truncation=True)\n",
92 | "\n",
93 | "# Will truncate the sequences that are longer than the specified max length\n",
94 | "model_inputs = tokenizer(sequences, max_length=8, truncation=True)"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
104 | "\n",
105 | "# Returns PyTorch tensors\n",
106 | "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"pt\")\n",
107 | "\n",
108 | "# Returns TensorFlow tensors\n",
109 | "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"tf\")\n",
110 | "\n",
111 | "# Returns NumPy arrays\n",
112 | "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"np\")"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {},
119 | "outputs": [
120 | {
121 | "data": {
122 | "text/plain": [
123 | "[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]\n",
124 | "[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]"
125 | ]
126 | },
127 | "execution_count": null,
128 | "metadata": {},
129 | "output_type": "execute_result"
130 | }
131 | ],
132 | "source": [
133 | "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n",
134 | "\n",
135 | "model_inputs = tokenizer(sequence)\n",
136 | "print(model_inputs[\"input_ids\"])\n",
137 | "\n",
138 | "tokens = tokenizer.tokenize(sequence)\n",
139 | "ids = tokenizer.convert_tokens_to_ids(tokens)\n",
140 | "print(ids)"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {},
147 | "outputs": [
148 | {
149 | "data": {
150 | "text/plain": [
151 | "\"[CLS] i've been waiting for a huggingface course my whole life. [SEP]\"\n",
152 | "\"i've been waiting for a huggingface course my whole life.\""
153 | ]
154 | },
155 | "execution_count": null,
156 | "metadata": {},
157 | "output_type": "execute_result"
158 | }
159 | ],
160 | "source": [
161 | "print(tokenizer.decode(model_inputs[\"input_ids\"]))\n",
162 | "print(tokenizer.decode(ids))"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "import torch\n",
172 | "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n",
173 | "\n",
174 | "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
175 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
176 | "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
177 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
178 | "\n",
179 | "tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors=\"pt\")\n",
180 | "output = model(**tokens)"
181 | ]
182 | }
183 | ],
184 | "metadata": {
185 | "colab": {
186 | "name": "Putting it all together (PyTorch)",
187 | "provenance": []
188 | }
189 | },
190 | "nbformat": 4,
191 | "nbformat_minor": 4
192 | }
193 |
--------------------------------------------------------------------------------
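The final cell above runs the sequence-classification model on the tokenized batch but does not interpret the result. A short sketch, assuming the `output`, `sequences` and `model` variables from that cell: the SST-2 checkpoint's config carries an `id2label` mapping (NEGATIVE / POSITIVE), so the logits can be softmaxed and mapped back to label names.

```python
import torch

# Convert logits to probabilities and look up the predicted label name.
probabilities = torch.nn.functional.softmax(output.logits, dim=-1)
for sequence, probs in zip(sequences, probabilities):
    label_id = int(probs.argmax())
    print(f"{sequence!r} -> {model.config.id2label[label_id]} ({float(probs[label_id]):.3f})")
```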
/course/chapter2/section6_tf.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Putting it all together (TensorFlow)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "from transformers import AutoTokenizer\n",
33 | "\n",
34 | "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
35 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
36 | "\n",
37 | "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n",
38 | "\n",
39 | "model_inputs = tokenizer(sequence)"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n",
49 | "\n",
50 | "model_inputs = tokenizer(sequence)"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
60 | "\n",
61 | "model_inputs = tokenizer(sequences)"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "# Will pad the sequences up to the maximum sequence length\n",
71 | "model_inputs = tokenizer(sequences, padding=\"longest\")\n",
72 | "\n",
73 | "# Will pad the sequences up to the model max length\n",
74 | "# (512 for BERT or DistilBERT)\n",
75 | "model_inputs = tokenizer(sequences, padding=\"max_length\")\n",
76 | "\n",
77 | "# Will pad the sequences up to the specified max length\n",
78 | "model_inputs = tokenizer(sequences, padding=\"max_length\", max_length=8)"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
88 | "\n",
89 | "# Will truncate the sequences that are longer than the model max length\n",
90 | "# (512 for BERT or DistilBERT)\n",
91 | "model_inputs = tokenizer(sequences, truncation=True)\n",
92 | "\n",
93 | "# Will truncate the sequences that are longer than the specified max length\n",
94 | "model_inputs = tokenizer(sequences, max_length=8, truncation=True)"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
104 | "\n",
105 | "# Returns PyTorch tensors\n",
106 | "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"pt\")\n",
107 | "\n",
108 | "# Returns TensorFlow tensors\n",
109 | "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"tf\")\n",
110 | "\n",
111 | "# Returns NumPy arrays\n",
112 | "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"np\")"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {},
119 | "outputs": [
120 | {
121 | "data": {
122 | "text/plain": [
123 | "[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]\n",
124 | "[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]"
125 | ]
126 | },
127 | "execution_count": null,
128 | "metadata": {},
129 | "output_type": "execute_result"
130 | }
131 | ],
132 | "source": [
133 | "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n",
134 | "\n",
135 | "model_inputs = tokenizer(sequence)\n",
136 | "print(model_inputs[\"input_ids\"])\n",
137 | "\n",
138 | "tokens = tokenizer.tokenize(sequence)\n",
139 | "ids = tokenizer.convert_tokens_to_ids(tokens)\n",
140 | "print(ids)"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {},
147 | "outputs": [
148 | {
149 | "data": {
150 | "text/plain": [
151 | "\"[CLS] i've been waiting for a huggingface course my whole life. [SEP]\"\n",
152 | "\"i've been waiting for a huggingface course my whole life.\""
153 | ]
154 | },
155 | "execution_count": null,
156 | "metadata": {},
157 | "output_type": "execute_result"
158 | }
159 | ],
160 | "source": [
161 | "print(tokenizer.decode(model_inputs[\"input_ids\"]))\n",
162 | "print(tokenizer.decode(ids))"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "import tensorflow as tf\n",
172 | "from transformers import AutoTokenizer, TFAutoModelForSequenceClassification\n",
173 | "\n",
174 | "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
175 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
176 | "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
177 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
178 | "\n",
179 | "tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors=\"tf\")\n",
180 | "output = model(**tokens)"
181 | ]
182 | }
183 | ],
184 | "metadata": {
185 | "colab": {
186 | "name": "Putting it all together (TensorFlow)",
187 | "provenance": []
188 | }
189 | },
190 | "nbformat": 4,
191 | "nbformat_minor": 4
192 | }
193 |
--------------------------------------------------------------------------------
/course/videos/token_pipeline_pt.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from transformers import pipeline\n",
61 | "\n",
62 | "token_classifier = pipeline(\"token-classification\")\n",
63 | "token_classifier(\"My name is Sylvain and I work at Hugging Face in Brooklyn.\")"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "token_classifier = pipeline(\"token-classification\", aggregation_strategy=\"simple\")\n",
73 | "token_classifier(\"My name is Sylvain and I work at Hugging Face in Brooklyn.\")"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "from transformers import AutoTokenizer, AutoModelForTokenClassification\n",
83 | "\n",
84 | "model_checkpoint = \"\"\n",
85 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
86 | "model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)\n",
87 | "\n",
88 | "example = \"My name is Sylvain and I work at Hugging Face in Brooklyn.\"\n",
89 | "inputs = tokenizer(example, return_tensors=\"pt\")\n",
90 | "outputs = model(**inputs)\n",
91 | "\n",
92 | "print(inputs[\"input_ids\"].shape)\n",
93 | "print(outputs.logits.shape)"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "import torch\n",
103 | "\n",
104 | "probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist()\n",
105 | "predictions = probabilities.argmax(dim=-1)[0].tolist()\n",
106 | "print(predictions)"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "model.config.id2label"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "results = []\n",
125 | "inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)\n",
126 | "tokens = inputs_with_offsets.tokens()\n",
127 | "offsets = inputs_with_offsets[\"offset_mapping\"]\n",
128 | "\n",
129 | "for idx, pred in enumerate(predictions):\n",
130 | " label = model.config.id2label[pred]\n",
131 | " if label != \"O\":\n",
132 | " start, end = offsets[idx]\n",
133 | " results.append(\n",
134 | " {\"entity\": label, \"score\": probabilities[idx][pred],\n",
135 | " \"word\": tokens[idx], \"start\": start, \"end\": end}\n",
136 | " )\n",
137 | "\n",
138 | "print(results)"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": null,
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "import numpy as np\n",
148 | "\n",
149 | "label_map = model.config.id2label\n",
150 | "results = []\n",
151 | "idx = 0\n",
152 | "while idx < len(predictions):\n",
153 | " pred = predictions[idx]\n",
154 | " label = label_map[pred]\n",
155 | " if label != \"O\":\n",
156 | " # Remove the B- or I-\n",
157 | " label = label[2:]\n",
158 | " start, _ = offsets[idx]\n",
159 | "\n",
160 | " # Grab all the tokens labeled with I-label\n",
161 | " all_scores = []\n",
162 | " while idx < len(predictions) and label_map[predictions[idx]] == f\"I-{label}\":\n",
163 | " all_scores.append(probabilities[idx][pred])\n",
164 | " _, end = offsets[idx]\n",
165 | " idx += 1\n",
166 | "\n",
167 | " # The score is the mean of all the scores of the token in that grouped entity.\n",
168 | " score = np.mean(all_scores).item()\n",
169 | " word = example[start:end]\n",
170 | " results.append(\n",
171 | " {\"entity_group\": label, \"score\": score,\n",
172 | " \"word\": word, \"start\": start, \"end\": end}\n",
173 | " )\n",
174 | " idx += 1"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "metadata": {},
181 | "outputs": [],
182 | "source": []
183 | }
184 | ],
185 | "metadata": {
186 | "colab": {
187 | "name": "Inside the Token classification pipeline (PyTorch)",
188 | "provenance": []
189 | }
190 | },
191 | "nbformat": 4,
192 | "nbformat_minor": 4
193 | }
194 |
--------------------------------------------------------------------------------
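The hand-rolled loop above is meant to reproduce the grouping that the pipeline performs with `aggregation_strategy="simple"`. A small check sketch, assuming the `example` string and `results` list from the notebook and that an actual NER checkpoint was filled in for the blank `model_checkpoint`:

```python
from transformers import pipeline

# Both outputs should contain one entry per grouped entity with comparable scores and spans.
token_classifier = pipeline("token-classification", aggregation_strategy="simple")
print(token_classifier(example))
print(results)
```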