├── .gitignore
├── README.md
├── sagemaker
│   ├── 14_train_and_push_to_hub
│   │   ├── README.md
│   │   ├── imgs
│   │   │   └── emotion-widget.png
│   │   └── scripts
│   │       └── train.py
│   ├── 15_training_compiler
│   │   ├── imgs
│   │   │   └── emotion-widget.png
│   │   └── scripts
│   │       └── train.py
│   ├── 13_deploy_and_autoscaling_transformers
│   │   └── imgs
│   │       ├── sm-endpoint.png
│   │       ├── scaling-options.jpeg
│   │       ├── autoscaling-endpoint.png
│   │       ├── hf-inference-toolkit.png
│   │       └── model-monitoring-dashboard.png
│   ├── 01_getting_started_pytorch
│   │   └── scripts
│   │       └── train.py
│   ├── 06_sagemaker_metrics
│   │   └── scripts
│   │       └── train.py
│   ├── 05_spot_instances
│   │   └── scripts
│   │       └── train.py
│   ├── 02_getting_started_tensorflow
│   │   └── scripts
│   │       └── train.py
│   └── 09_image_classification_vision_transformer
│       └── scripts
│           └── train.py
├── Makefile
├── examples
│   └── images
│       ├── translation.png
│       ├── summarization.png
│       ├── model_parameters.png
│       ├── question_answering.png
│       ├── text_classification.png
│       ├── token_classification.png
│       ├── causal_language_modeling.png
│       └── masked_language_modeling.png
├── longform-qa
│   └── images
│       ├── fireworks.gif
│       ├── ELI5animation.gif
│       └── huggingface_logo.jpg
├── transformers_doc
│   ├── imgs
│   │   ├── ppl_full.gif
│   │   ├── ppl_chunked.gif
│   │   └── ppl_sliding.gif
│   └── README.md
└── course
    ├── chapter8
    │   ├── section5.ipynb
    │   └── section3.ipynb
    ├── chapter1
    │   └── section8.ipynb
    ├── videos
    │   ├── pre_tokenization.ipynb
    │   ├── rouge_metric.ipynb
    │   ├── perplexity.ipynb
    │   ├── normalization.ipynb
    │   ├── offset_mapping.ipynb
    │   ├── domain_adaptation.ipynb
    │   ├── bleu_metric.ipynb
    │   ├── datasets_and_dataframes.ipynb
    │   ├── fast_tokenizers.ipynb
    │   ├── debug_error.ipynb
    │   ├── summarization_processing.ipynb
    │   ├── clm_processing.ipynb
    │   ├── load_custom_dataset.ipynb
    │   ├── debug_training_tf.ipynb
    │   ├── save_load_dataset.ipynb
    │   ├── mlm_processing.ipynb
    │   ├── building_tokenizer.ipynb
    │   ├── memory_mapping_streaming.ipynb
    │   ├── train_new_tokenizer.ipynb
    │   ├── slice_and_dice.ipynb
    │   ├── custom_loss.ipynb
    │   ├── token_processing.ipynb
    │   ├── translation_processing.ipynb
    │   ├── sentence_pairs_tf.ipynb
    │   ├── tensorflow_finetuning.ipynb
    │   ├── semantic_search.ipynb
    │   └── token_pipeline_pt.ipynb
    ├── chapter4
    │   ├── section2_pt.ipynb
    │   └── section2_tf.ipynb
    ├── chapter2
    │   ├── section3_pt.ipynb
    │   ├── section3_tf.ipynb
    │   ├── section4_pt.ipynb
    │   ├── section4_tf.ipynb
    │   ├── section6_pt.ipynb
    │   └── section6_tf.ipynb
    ├── chapter6
    │   └── section4.ipynb
    ├── chapter5
    │   └── section2.ipynb
    └── chapter3
        ├── section3.ipynb
        └── section3_tf.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | # Jupyter Notebook
2 | .ipynb_checkpoints
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # notebooks
2 | Notebooks using the Hugging Face libraries 🤗
3 |
--------------------------------------------------------------------------------
/sagemaker/14_train_and_push_to_hub/README.md:
--------------------------------------------------------------------------------
1 | # SageMaker push to hf.co/models example
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: doc-notebooks
2 |
3 | doc-notebooks:
4 | python utils/convert_doc_to_notebooks.py
5 |
--------------------------------------------------------------------------------
/examples/images/translation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/translation.png
--------------------------------------------------------------------------------
/longform-qa/images/fireworks.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/longform-qa/images/fireworks.gif
--------------------------------------------------------------------------------
/examples/images/summarization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/summarization.png
--------------------------------------------------------------------------------
/transformers_doc/imgs/ppl_full.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/transformers_doc/imgs/ppl_full.gif
--------------------------------------------------------------------------------
/examples/images/model_parameters.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/model_parameters.png
--------------------------------------------------------------------------------
/longform-qa/images/ELI5animation.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/longform-qa/images/ELI5animation.gif
--------------------------------------------------------------------------------
/transformers_doc/imgs/ppl_chunked.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/transformers_doc/imgs/ppl_chunked.gif
--------------------------------------------------------------------------------
/transformers_doc/imgs/ppl_sliding.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/transformers_doc/imgs/ppl_sliding.gif
--------------------------------------------------------------------------------
/examples/images/question_answering.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/question_answering.png
--------------------------------------------------------------------------------
/examples/images/text_classification.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/text_classification.png
--------------------------------------------------------------------------------
/longform-qa/images/huggingface_logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/longform-qa/images/huggingface_logo.jpg
--------------------------------------------------------------------------------
/examples/images/token_classification.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/token_classification.png
--------------------------------------------------------------------------------
/examples/images/causal_language_modeling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/causal_language_modeling.png
--------------------------------------------------------------------------------
/examples/images/masked_language_modeling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/masked_language_modeling.png
--------------------------------------------------------------------------------
/sagemaker/15_training_compiler/imgs/emotion-widget.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/sagemaker/15_training_compiler/imgs/emotion-widget.png
--------------------------------------------------------------------------------
/sagemaker/14_train_and_push_to_hub/imgs/emotion-widget.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/sagemaker/14_train_and_push_to_hub/imgs/emotion-widget.png
--------------------------------------------------------------------------------
/sagemaker/13_deploy_and_autoscaling_transformers/imgs/sm-endpoint.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/sagemaker/13_deploy_and_autoscaling_transformers/imgs/sm-endpoint.png
--------------------------------------------------------------------------------
/sagemaker/13_deploy_and_autoscaling_transformers/imgs/scaling-options.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/sagemaker/13_deploy_and_autoscaling_transformers/imgs/scaling-options.jpeg
--------------------------------------------------------------------------------
/sagemaker/13_deploy_and_autoscaling_transformers/imgs/autoscaling-endpoint.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/sagemaker/13_deploy_and_autoscaling_transformers/imgs/autoscaling-endpoint.png
--------------------------------------------------------------------------------
/sagemaker/13_deploy_and_autoscaling_transformers/imgs/hf-inference-toolkit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/sagemaker/13_deploy_and_autoscaling_transformers/imgs/hf-inference-toolkit.png
--------------------------------------------------------------------------------
/sagemaker/13_deploy_and_autoscaling_transformers/imgs/model-monitoring-dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/sagemaker/13_deploy_and_autoscaling_transformers/imgs/model-monitoring-dashboard.png
--------------------------------------------------------------------------------
/transformers_doc/README.md:
--------------------------------------------------------------------------------
1 | # 🤗 Transformers doc notebooks
2 |
3 | These notebooks are automatically generated from the [🤗 Transformers documentation](https://huggingface.co/transformers/)
4 | so you should not make any direct modification here. If there is a typo to fix or a sentence to add, open a pull
5 | request in the [🤗 Transformers repo](https://github.com/huggingface/transformers) and fix the corresponding file in
6 | the `docs/source/` folder.
7 |
8 | If there is something that seems weirdly converted from the original doc file, open an issue in this repo and we will
9 | try to fix the conversion script.
--------------------------------------------------------------------------------
/course/chapter8/section5.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# How to write a good issue"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": []
32 | }
33 | ],
34 | "metadata": {
35 | "colab": {
36 | "name": "How to write a good issue",
37 | "provenance": []
38 | }
39 | },
40 | "nbformat": 4,
41 | "nbformat_minor": 4
42 | }
43 |
--------------------------------------------------------------------------------
/course/chapter1/section8.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Bias and limitations"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "data": {
33 | "text/plain": [
34 | "['lawyer', 'carpenter', 'doctor', 'waiter', 'mechanic']\n",
35 | "['nurse', 'waitress', 'teacher', 'maid', 'prostitute']"
36 | ]
37 | },
38 | "execution_count": null,
39 | "metadata": {},
40 | "output_type": "execute_result"
41 | }
42 | ],
43 | "source": [
44 | "from transformers import pipeline\n",
45 | "\n",
46 | "unmasker = pipeline(\"fill-mask\", model=\"bert-base-uncased\")\n",
47 | "result = unmasker(\"This man works as a [MASK].\")\n",
48 | "print([r[\"token_str\"] for r in result])\n",
49 | "\n",
50 | "result = unmasker(\"This woman works as a [MASK].\")\n",
51 | "print([r[\"token_str\"] for r in result])"
52 | ]
53 | }
54 | ],
55 | "metadata": {
56 | "colab": {
57 | "name": "Bias and limitations",
58 | "provenance": []
59 | }
60 | },
61 | "nbformat": 4,
62 | "nbformat_minor": 4
63 | }
64 |
--------------------------------------------------------------------------------
/course/videos/pre_tokenization.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from transformers import AutoTokenizerFast\n",
61 | "\n",
62 | "tokenizer = AutoTokenizerFast.from_pretrained('albert-base-v1’)\n",
63 | "\n",
64 | "text = \"3.2.1: let's get started!\"\n",
65 | "\n",
66 | "print(tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text))"
67 | ]
68 | }
69 | ],
70 | "metadata": {
71 | "colab": {
72 | "name": "What is pre-tokenization?",
73 | "provenance": []
74 | }
75 | },
76 | "nbformat": 4,
77 | "nbformat_minor": 4
78 | }
79 |
--------------------------------------------------------------------------------
/course/videos/rouge_metric.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "! pip install nltk rouge_score"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "from datasets import load_metric\n",
70 | "\n",
71 | "rouge = load_metric(\"rouge\")\n",
72 | "predictions = [\"I really loved reading the Hunger Games\"]\n",
73 | "references = [\"I loved reading the Hunger Games\"]\n",
74 | "rouge.compute(predictions=predictions, references=references)"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": []
83 | }
84 | ],
85 | "metadata": {
86 | "colab": {
87 | "name": "What is the ROUGE metric?",
88 | "provenance": []
89 | }
90 | },
91 | "nbformat": 4,
92 | "nbformat_minor": 4
93 | }
94 |
--------------------------------------------------------------------------------
/course/videos/perplexity.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
61 | "import torch\n",
62 | "\n",
63 | "model = AutoModelForCausalLM.from_pretrained(\"gpt2\")\n",
64 | "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
65 | "\n",
66 | "inputs = tokenizer(\"Hugging Face is a startup based in New York City and Paris\",\n",
67 | " return_tensors=\"pt\")\n",
68 | "\n",
69 | "loss = model(input_ids=inputs[\"input_ids\"],\n",
70 | " labels=inputs[\"input_ids\"]).loss\n",
71 | "\n",
72 | "ppl = torch.exp(loss)\n",
73 | "\n",
74 | "print(f\"Perplexity: {ppl.item():.2f}\")"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": []
83 | }
84 | ],
85 | "metadata": {
86 | "colab": {
87 | "name": "What is perplexity?",
88 | "provenance": []
89 | }
90 | },
91 | "nbformat": 4,
92 | "nbformat_minor": 4
93 | }
94 |
--------------------------------------------------------------------------------
/course/videos/normalization.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from transformers import AutoTokenizer\n",
61 | "\n",
62 | "text = \"This is a text with àccënts and CAPITAL LETTERS\"\n",
63 | "\n",
64 | "tokenizer = AutoTokenizer.from_pretrained(\"albert-large-v2\")\n",
65 | "print(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))\n",
66 | "\n",
67 | "tokenizer = AutoTokenizer.from_pretrained(\"huggingface-course/albert-tokenizer-without-normalizer\")\n",
68 | "print(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "text = \"un père indigné\"\n",
78 | "\n",
79 | "tokenizer = AutoTokenizerFast.from_pretrained('distilbert-base-uncased')\n",
80 | "print(tokenizer.backend_tokenizer.normalizer.normalize_str(text))"
81 | ]
82 | }
83 | ],
84 | "metadata": {
85 | "colab": {
86 | "name": "What is normalization?",
87 | "provenance": []
88 | }
89 | },
90 | "nbformat": 4,
91 | "nbformat_minor": 4
92 | }
93 |
--------------------------------------------------------------------------------
/course/chapter4/section2_pt.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Using pretrained models (PyTorch)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "data": {
33 | "text/plain": [
34 | "[\n",
35 | " {'sequence': 'Le camembert est délicieux :)', 'score': 0.49091005325317383, 'token': 7200, 'token_str': 'délicieux'}, \n",
36 | " {'sequence': 'Le camembert est excellent :)', 'score': 0.1055697426199913, 'token': 2183, 'token_str': 'excellent'}, \n",
37 | " {'sequence': 'Le camembert est succulent :)', 'score': 0.03453313186764717, 'token': 26202, 'token_str': 'succulent'}, \n",
38 | " {'sequence': 'Le camembert est meilleur :)', 'score': 0.0330314114689827, 'token': 528, 'token_str': 'meilleur'}, \n",
39 | " {'sequence': 'Le camembert est parfait :)', 'score': 0.03007650189101696, 'token': 1654, 'token_str': 'parfait'}\n",
40 | "]"
41 | ]
42 | },
43 | "execution_count": null,
44 | "metadata": {},
45 | "output_type": "execute_result"
46 | }
47 | ],
48 | "source": [
49 | "from transformers import pipeline\n",
50 | "\n",
51 | "camembert_fill_mask = pipeline(\"fill-mask\", model=\"camembert-base\")\n",
52 | "results = camembert_fill_mask(\"Le camembert est :)\")"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "from transformers import CamembertTokenizer, CamembertForMaskedLM\n",
62 | "\n",
63 | "tokenizer = CamembertTokenizer.from_pretrained(\"camembert-base\")\n",
64 | "model = CamembertForMaskedLM.from_pretrained(\"camembert-base\")"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "from transformers import AutoTokenizer, AutoModelForMaskedLM\n",
74 | "\n",
75 | "tokenizer = AutoTokenizer.from_pretrained(\"camembert-base\")\n",
76 | "model = AutoModelForMaskedLM.from_pretrained(\"camembert-base\")"
77 | ]
78 | }
79 | ],
80 | "metadata": {
81 | "colab": {
82 | "name": "Using pretrained models (PyTorch)",
83 | "provenance": []
84 | }
85 | },
86 | "nbformat": 4,
87 | "nbformat_minor": 4
88 | }
89 |
--------------------------------------------------------------------------------
/course/chapter4/section2_tf.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Using pretrained models (TensorFlow)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "data": {
33 | "text/plain": [
34 | "[\n",
35 | " {'sequence': 'Le camembert est délicieux :)', 'score': 0.49091005325317383, 'token': 7200, 'token_str': 'délicieux'}, \n",
36 | " {'sequence': 'Le camembert est excellent :)', 'score': 0.1055697426199913, 'token': 2183, 'token_str': 'excellent'}, \n",
37 | " {'sequence': 'Le camembert est succulent :)', 'score': 0.03453313186764717, 'token': 26202, 'token_str': 'succulent'}, \n",
38 | " {'sequence': 'Le camembert est meilleur :)', 'score': 0.0330314114689827, 'token': 528, 'token_str': 'meilleur'}, \n",
39 | " {'sequence': 'Le camembert est parfait :)', 'score': 0.03007650189101696, 'token': 1654, 'token_str': 'parfait'}\n",
40 | "]"
41 | ]
42 | },
43 | "execution_count": null,
44 | "metadata": {},
45 | "output_type": "execute_result"
46 | }
47 | ],
48 | "source": [
49 | "from transformers import pipeline\n",
50 | "\n",
51 | "camembert_fill_mask = pipeline(\"fill-mask\", model=\"camembert-base\")\n",
52 | "results = camembert_fill_mask(\"Le camembert est :)\")"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "from transformers import CamembertTokenizer, TFCamembertForMaskedLM\n",
62 | "\n",
63 | "tokenizer = CamembertTokenizer.from_pretrained(\"camembert-base\")\n",
64 | "model = TFCamembertForMaskedLM.from_pretrained(\"camembert-base\")"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "from transformers import AutoTokenizer, TFAutoModelForMaskedLM\n",
74 | "\n",
75 | "tokenizer = AutoTokenizer.from_pretrained(\"camembert-base\")\n",
76 | "model = TFAutoModelForMaskedLM.from_pretrained(\"camembert-base\")"
77 | ]
78 | }
79 | ],
80 | "metadata": {
81 | "colab": {
82 | "name": "Using pretrained models (TensorFlow)",
83 | "provenance": []
84 | }
85 | },
86 | "nbformat": 4,
87 | "nbformat_minor": 4
88 | }
89 |
--------------------------------------------------------------------------------
/course/videos/offset_mapping.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from transformers import AutoTokenizer\n",
61 | "\n",
62 | "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n",
63 | "print(tokenizer(\"Let's talk about tokenizers superpowers.\")[\"input_ids\"])\n",
64 | "print(tokenizer(\"Let's talk about tokenizers superpowers.\")[\"input_ids\"])"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "encoding = tokenizer(\"Let's talk about tokenizers superpowers.\")\n",
74 | "print(encoding.tokens())\n",
75 | "print(encoding.word_ids())"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "encoding = tokenizer(\n",
85 | " \"Let's talk about tokenizers superpowers.\",\n",
86 | " return_offsets_mapping=True\n",
87 | ")\n",
88 | "print(encoding.tokens())\n",
89 | "print(encoding[\"offset_mapping\"])"
90 | ]
91 | }
92 | ],
93 | "metadata": {
94 | "colab": {
95 | "name": "Fast tokenizer superpowers",
96 | "provenance": []
97 | }
98 | },
99 | "nbformat": 4,
100 | "nbformat_minor": 4
101 | }
102 |
--------------------------------------------------------------------------------
/course/videos/domain_adaptation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from transformers import pipeline"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "model_checkpoint = \"distilbert-base-uncased\"\n",
70 | "fill_masker = pipeline(\"fill-mask\", model=model_checkpoint)\n",
71 | "fill_masker(\"This is a great [MASK].\")"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "model_checkpoint = \"huggingface-course/distilbert-base-uncased-finetuned-imdb\"\n",
81 | "fill_masker = pipeline(\"fill-mask\", model=model_checkpoint)\n",
82 | "fill_masker(\"This is a great [MASK].\")"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "model_checkpoint = \"Helsinki-NLP/opus-mt-en-fr\"\n",
92 | "translator = pipeline(\"translation\", model=model_checkpoint)\n",
93 | "translator(\"This plugin automatically translates emails.\")"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "model_checkpoint = \"huggingface-course/marian-finetuned-kde4-en-to-fr\")\n",
103 | "translator = pipeline(\"translation\", model=model_checkpoint)\n",
104 | "translator(\"This plugin automatically translates emails.\")"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": []
113 | }
114 | ],
115 | "metadata": {
116 | "colab": {
117 | "name": "What is domain adaptation?",
118 | "provenance": []
119 | }
120 | },
121 | "nbformat": 4,
122 | "nbformat_minor": 4
123 | }
124 |
--------------------------------------------------------------------------------
/course/chapter2/section3_pt.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Models (PyTorch)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "from transformers import BertConfig, BertModel\n",
33 | "\n",
34 | "# Building the config\n",
35 | "config = BertConfig()\n",
36 | "\n",
37 | "# Building the model from the config\n",
38 | "model = BertModel(config)"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {},
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/plain": [
49 | "BertConfig {\n",
50 | " [...]\n",
51 | " \"hidden_size\": 768,\n",
52 | " \"intermediate_size\": 3072,\n",
53 | " \"max_position_embeddings\": 512,\n",
54 | " \"num_attention_heads\": 12,\n",
55 | " \"num_hidden_layers\": 12,\n",
56 | " [...]\n",
57 | "}"
58 | ]
59 | },
60 | "execution_count": null,
61 | "metadata": {},
62 | "output_type": "execute_result"
63 | }
64 | ],
65 | "source": [
66 | "print(config)"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "from transformers import BertConfig, BertModel\n",
76 | "\n",
77 | "config = BertConfig()\n",
78 | "model = BertModel(config)\n",
79 | "\n",
80 | "# Model is randomly initialized!"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "from transformers import BertModel\n",
90 | "\n",
91 | "model = BertModel.from_pretrained(\"bert-base-cased\")"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "model.save_pretrained(\"directory_on_my_computer\")"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "sequences = [\"Hello!\", \"Cool.\", \"Nice!\"]"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "encoded_sequences = [\n",
119 | " [101, 7592, 999, 102],\n",
120 | " [101, 4658, 1012, 102],\n",
121 | " [101, 3835, 999, 102],\n",
122 | "]"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "import torch\n",
132 | "\n",
133 | "model_inputs = torch.tensor(encoded_sequences)"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "output = model(model_inputs)"
143 | ]
144 | }
145 | ],
146 | "metadata": {
147 | "colab": {
148 | "name": "Models (PyTorch)",
149 | "provenance": []
150 | }
151 | },
152 | "nbformat": 4,
153 | "nbformat_minor": 4
154 | }
155 |
--------------------------------------------------------------------------------
/course/chapter6/section4.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Normalization and pre-tokenization"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "data": {
33 | "text/plain": [
34 | ""
35 | ]
36 | },
37 | "execution_count": null,
38 | "metadata": {},
39 | "output_type": "execute_result"
40 | }
41 | ],
42 | "source": [
43 | "from transformers import AutoTokenizer\n",
44 | "\n",
45 | "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
46 | "print(type(tokenizer.backend_tokenizer))"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {},
53 | "outputs": [
54 | {
55 | "data": {
56 | "text/plain": [
57 | "'hello how are u?'"
58 | ]
59 | },
60 | "execution_count": null,
61 | "metadata": {},
62 | "output_type": "execute_result"
63 | }
64 | ],
65 | "source": [
66 | "print(tokenizer.backend_tokenizer.normalizer.normalize_str(\"Héllò hôw are ü?\"))"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [
74 | {
75 | "data": {
76 | "text/plain": [
77 | "[('Hello', (0, 5)), (',', (5, 6)), ('how', (7, 10)), ('are', (11, 14)), ('you', (16, 19)), ('?', (19, 20))]"
78 | ]
79 | },
80 | "execution_count": null,
81 | "metadata": {},
82 | "output_type": "execute_result"
83 | }
84 | ],
85 | "source": [
86 | "tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(\"Hello, how are you?\")"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {},
93 | "outputs": [
94 | {
95 | "data": {
96 | "text/plain": [
97 | "[('Hello', (0, 5)), (',', (5, 6)), ('Ġhow', (6, 10)), ('Ġare', (10, 14)), ('Ġ', (14, 15)), ('Ġyou', (15, 19)),\n",
98 | " ('?', (19, 20))]"
99 | ]
100 | },
101 | "execution_count": null,
102 | "metadata": {},
103 | "output_type": "execute_result"
104 | }
105 | ],
106 | "source": [
107 | "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
108 | "tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(\"Hello, how are you?\")"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {},
115 | "outputs": [
116 | {
117 | "data": {
118 | "text/plain": [
119 | "[('▁Hello,', (0, 6)), ('▁how', (7, 10)), ('▁are', (11, 14)), ('▁you?', (16, 20))]"
120 | ]
121 | },
122 | "execution_count": null,
123 | "metadata": {},
124 | "output_type": "execute_result"
125 | }
126 | ],
127 | "source": [
128 | "tokenizer = AutoTokenizer.from_pretrained(\"t5-small\")\n",
129 | "tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(\"Hello, how are you?\")"
130 | ]
131 | }
132 | ],
133 | "metadata": {
134 | "colab": {
135 | "name": "Normalization and pre-tokenization",
136 | "provenance": []
137 | }
138 | },
139 | "nbformat": 4,
140 | "nbformat_minor": 4
141 | }
142 |
--------------------------------------------------------------------------------
/course/chapter2/section3_tf.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Models (TensorFlow)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "from transformers import BertConfig, TFBertModel\n",
33 | "\n",
34 | "# Building the config\n",
35 | "config = BertConfig()\n",
36 | "\n",
37 | "# Building the model from the config\n",
38 | "model = TFBertModel(config)"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {},
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/plain": [
49 | "BertConfig {\n",
50 | " [...]\n",
51 | " \"hidden_size\": 768,\n",
52 | " \"intermediate_size\": 3072,\n",
53 | " \"max_position_embeddings\": 512,\n",
54 | " \"num_attention_heads\": 12,\n",
55 | " \"num_hidden_layers\": 12,\n",
56 | " [...]\n",
57 | "}"
58 | ]
59 | },
60 | "execution_count": null,
61 | "metadata": {},
62 | "output_type": "execute_result"
63 | }
64 | ],
65 | "source": [
66 | "print(config)"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "from transformers import BertConfig, TFBertModel\n",
76 | "\n",
77 | "config = BertConfig()\n",
78 | "model = TFBertModel(config)\n",
79 | "\n",
80 | "# Model is randomly initialized!"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "from transformers import TFBertModel\n",
90 | "\n",
91 | "model = TFBertModel.from_pretrained(\"bert-base-cased\")"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "model.save_pretrained(\"directory_on_my_computer\")"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "sequences = [\"Hello!\", \"Cool.\", \"Nice!\"]"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "encoded_sequences = [\n",
119 | " [101, 7592, 999, 102],\n",
120 | " [101, 4658, 1012, 102],\n",
121 | " [101, 3835, 999, 102],\n",
122 | "]"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "import tensorflow as tf\n",
132 | "\n",
133 | "model_inputs = tf.constant(encoded_sequences)"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "output = model(model_inputs)"
143 | ]
144 | }
145 | ],
146 | "metadata": {
147 | "colab": {
148 | "name": "Models (TensorFlow)",
149 | "provenance": []
150 | }
151 | },
152 | "nbformat": 4,
153 | "nbformat_minor": 4
154 | }
155 |
--------------------------------------------------------------------------------
/sagemaker/01_getting_started_pytorch/scripts/train.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
2 | from sklearn.metrics import accuracy_score, precision_recall_fscore_support
3 | from datasets import load_from_disk
4 | import random
5 | import logging
6 | import sys
7 | import argparse
8 | import os
9 | import torch
10 |
11 | if __name__ == "__main__":
12 |
13 | parser = argparse.ArgumentParser()
14 |
15 | # hyperparameters sent by the client are passed as command-line arguments to the script.
16 | parser.add_argument("--epochs", type=int, default=3)
17 | parser.add_argument("--train_batch_size", type=int, default=32)
18 | parser.add_argument("--eval_batch_size", type=int, default=64)
19 | parser.add_argument("--warmup_steps", type=int, default=500)
20 | parser.add_argument("--model_name", type=str)
21 | parser.add_argument("--learning_rate", type=str, default=5e-5)
22 |
23 | # Data, model, and output directories
24 | parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
25 | parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
26 | parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
27 | parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
28 | parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
29 |
30 | args, _ = parser.parse_known_args()
31 |
32 | # Set up logging
33 | logger = logging.getLogger(__name__)
34 |
35 | logging.basicConfig(
36 | level=logging.getLevelName("INFO"),
37 | handlers=[logging.StreamHandler(sys.stdout)],
38 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
39 | )
40 |
41 | # load datasets
42 | train_dataset = load_from_disk(args.training_dir)
43 | test_dataset = load_from_disk(args.test_dir)
44 |
45 | logger.info(f" loaded train_dataset length is: {len(train_dataset)}")
46 | logger.info(f" loaded test_dataset length is: {len(test_dataset)}")
47 |
48 | # compute metrics function for binary classification
49 | def compute_metrics(pred):
50 | labels = pred.label_ids
51 | preds = pred.predictions.argmax(-1)
52 | precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
53 | acc = accuracy_score(labels, preds)
54 | return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}
55 |
56 | # download model from model hub
57 | model = AutoModelForSequenceClassification.from_pretrained(args.model_name)
58 | tokenizer = AutoTokenizer.from_pretrained(args.model_name)
59 |
60 | # define training args
61 | training_args = TrainingArguments(
62 | output_dir=args.model_dir,
63 | num_train_epochs=args.epochs,
64 | per_device_train_batch_size=args.train_batch_size,
65 | per_device_eval_batch_size=args.eval_batch_size,
66 | warmup_steps=args.warmup_steps,
67 | evaluation_strategy="epoch",
68 | logging_dir=f"{args.output_data_dir}/logs",
69 | learning_rate=float(args.learning_rate),
70 | )
71 |
72 | # create Trainer instance
73 | trainer = Trainer(
74 | model=model,
75 | args=training_args,
76 | compute_metrics=compute_metrics,
77 | train_dataset=train_dataset,
78 | eval_dataset=test_dataset,
79 | tokenizer=tokenizer,
80 | )
81 |
82 | # train model
83 | trainer.train()
84 |
85 | # evaluate model
86 | eval_result = trainer.evaluate(eval_dataset=test_dataset)
87 |
88 | # writes eval result to file which can be accessed later in s3 output
89 | with open(os.path.join(args.output_data_dir, "eval_results.txt"), "w") as writer:
90 | print(f"***** Eval results *****")
91 | for key, value in sorted(eval_result.items()):
92 | writer.write(f"{key} = {value}\n")
93 |
94 | # Saves the model to s3
95 | trainer.save_model(args.model_dir)
96 |
--------------------------------------------------------------------------------
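Note on usage: the script above follows the SageMaker training contract — hyperparameters arrive as command-line arguments, and the `SM_CHANNEL_TRAIN` / `SM_CHANNEL_TEST` environment variables point at the downloaded input channels. A minimal launch sketch with the SageMaker Python SDK's `HuggingFace` estimator is shown below; the role handling, instance type, container versions, and S3 URIs are placeholder assumptions, not values taken from this repository.

    # Hypothetical launcher for scripts/train.py (not a file in this repo).
    import sagemaker
    from sagemaker.huggingface import HuggingFace

    role = sagemaker.get_execution_role()  # assumes a SageMaker notebook/Studio environment

    # These keys map onto the argparse arguments defined in train.py.
    hyperparameters = {
        "epochs": 1,
        "train_batch_size": 32,
        "model_name": "distilbert-base-uncased",
    }

    huggingface_estimator = HuggingFace(
        entry_point="train.py",
        source_dir="./scripts",
        instance_type="ml.p3.2xlarge",   # placeholder instance type
        instance_count=1,
        role=role,
        transformers_version="4.6",      # placeholder container versions
        pytorch_version="1.7",
        py_version="py36",
        hyperparameters=hyperparameters,
    )

    # The "train" and "test" channels populate SM_CHANNEL_TRAIN / SM_CHANNEL_TEST,
    # which train.py reads via --training_dir and --test_dir.
    huggingface_estimator.fit({
        "train": "s3://my-bucket/train",  # placeholder S3 URIs
        "test": "s3://my-bucket/test",
    })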
/sagemaker/06_sagemaker_metrics/scripts/train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import os
4 | import random
5 | import sys
6 |
7 | from datasets import load_from_disk
8 | from sklearn.metrics import accuracy_score, precision_recall_fscore_support
9 | import torch
10 | from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
11 |
12 |
13 | if __name__ == "__main__":
14 |
15 | parser = argparse.ArgumentParser()
16 |
17 | # hyperparameters sent by the client are passed as command-line arguments to the script.
18 | parser.add_argument("--epochs", type=int, default=3)
19 | parser.add_argument("--train_batch_size", type=int, default=32)
20 | parser.add_argument("--eval_batch_size", type=int, default=64)
21 | parser.add_argument("--warmup_steps", type=int, default=500)
22 | parser.add_argument("--model_name", type=str)
23 | parser.add_argument("--learning_rate", type=float, default=5e-5)
24 |
25 | # Data, model, and output directories
26 | parser.add_argument("--checkpoints", type=str, default="/opt/ml/checkpoints/")
27 | parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
28 | parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
29 | parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
30 | parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
31 |
32 | args, _ = parser.parse_known_args()
33 |
34 | # Set up logging
35 | logger = logging.getLogger(__name__)
36 |
37 | logging.basicConfig(
38 | level=logging.getLevelName("INFO"),
39 | handlers=[logging.StreamHandler(sys.stdout)],
40 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
41 | )
42 |
43 | # load datasets
44 | train_dataset = load_from_disk(args.training_dir)
45 | test_dataset = load_from_disk(args.test_dir)
46 |
47 | logger.info(f" loaded train_dataset length is: {len(train_dataset)}")
48 | logger.info(f" loaded test_dataset length is: {len(test_dataset)}")
49 |
50 | # compute metrics function for binary classification
51 | def compute_metrics(pred):
52 | labels = pred.label_ids
53 | preds = pred.predictions.argmax(-1)
54 | precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
55 | acc = accuracy_score(labels, preds)
56 | return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}
57 |
58 | # download model from model hub
59 | model = AutoModelForSequenceClassification.from_pretrained(args.model_name)
60 | tokenizer = AutoTokenizer.from_pretrained(args.model_name)
61 |
62 | # define training args
63 | training_args = TrainingArguments(
64 | output_dir=args.checkpoints,
65 | num_train_epochs=args.epochs,
66 | per_device_train_batch_size=args.train_batch_size,
67 | per_device_eval_batch_size=args.eval_batch_size,
68 | warmup_steps=args.warmup_steps,
69 | evaluation_strategy="epoch",
70 | logging_dir=f"{args.checkpoints}/logs",
71 | learning_rate=args.learning_rate,
72 | )
73 |
74 | # create Trainer instance
75 | trainer = Trainer(
76 | model=model,
77 | args=training_args,
78 | compute_metrics=compute_metrics,
79 | train_dataset=train_dataset,
80 | eval_dataset=test_dataset,
81 | tokenizer=tokenizer,
82 | )
83 |
84 | # train model
85 | trainer.train()
86 |
87 | # evaluate model
88 | eval_result = trainer.evaluate(eval_dataset=test_dataset)
89 |
90 | # writes eval result to file which can be accessed later in s3 output
91 | with open(os.path.join(args.checkpoints, "eval_results.txt"), "w") as writer:
92 | print(f"***** Eval results *****")
93 | for key, value in sorted(eval_result.items()):
94 | writer.write(f"{key} = {value}\n")
95 |
96 | # Saves the model locally. In SageMaker, writing in /opt/ml/model sends it to S3
97 | trainer.save_model(args.model_dir)
98 |
--------------------------------------------------------------------------------
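Note on usage: the point of the 06_sagemaker_metrics example is that the Trainer's evaluation logs can be surfaced as SageMaker training-job metrics. That wiring lives in the estimator, not in train.py: `metric_definitions` regexes are matched against the job's log stream. A minimal sketch follows; the regexes, container versions, role ARN, and instance type are illustrative assumptions and must be adapted to the actual log format.

    # Hypothetical estimator configuration for scripts/train.py (not a file in this repo).
    from sagemaker.huggingface import HuggingFace

    # Each regex captures one number from the Trainer's evaluation log lines.
    metric_definitions = [
        {"Name": "eval_loss", "Regex": "'eval_loss': ([0-9\\.]+)"},
        {"Name": "eval_accuracy", "Regex": "'eval_accuracy': ([0-9\\.]+)"},
        {"Name": "eval_f1", "Regex": "'eval_f1': ([0-9\\.]+)"},
    ]

    huggingface_estimator = HuggingFace(
        entry_point="train.py",
        source_dir="./scripts",
        instance_type="ml.p3.2xlarge",                          # placeholder
        instance_count=1,
        role="arn:aws:iam::111122223333:role/SageMakerRole",    # placeholder role ARN
        transformers_version="4.6",                             # placeholder container versions
        pytorch_version="1.7",
        py_version="py36",
        metric_definitions=metric_definitions,                  # surfaced via CloudWatch from the job logs
        hyperparameters={"epochs": 1, "model_name": "distilbert-base-uncased"},
    )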
/course/videos/bleu_metric.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_metric\n",
61 | "\n",
62 | "bleu = load_metric(\"bleu\")\n",
63 | "predictions = [[\"I\", \"have\", \"thirty\", \"six\", \"years\"]]\n",
64 | "references = [\n",
65 | " [[\"I\", \"am\", \"thirty\", \"six\", \"years\", \"old\"], [\"I\", \"am\", \"thirty\", \"six\"]]\n",
66 | "]\n",
67 | "bleu.compute(predictions=predictions, references=references)"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "predictions = [[\"I\", \"have\", \"thirty\", \"six\", \"years\"]]\n",
77 | "references = [\n",
78 | " [[\"I\", \"am\", \"thirty\", \"six\", \"years\", \"old\"], [\"I\", \"am\", \"thirty\", \"six\"]]\n",
79 | "]\n",
80 | "bleu.compute(predictions=predictions, references=references)"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "predictions = [[\"I\", \"have\", \"thirty\", \"six\", \"years\"]]\n",
90 | "references = [\n",
91 | " [[\"I\", \"am\", \"thirty\", \"six\", \"years\", \"old\"], [\"I\", \"am\", \"thirty\", \"six\"]]\n",
92 | "]\n",
93 | "bleu.compute(predictions=predictions, references=references)"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "! pip install sacrebleu"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "sacrebleu = load_metric(\"sacrebleu\")\n",
112 | "# SacreBLEU operates on raw text, not tokens\n",
113 | "predictions = [\"I have thirty six years\"]\n",
114 | "references = [[\"I am thirty six years old\", \"I am thirty six\"]]\n",
115 | "sacrebleu.compute(predictions=predictions, references=references)"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": []
124 | }
125 | ],
126 | "metadata": {
127 | "colab": {
128 | "name": "What is the BLEU metric?",
129 | "provenance": []
130 | }
131 | },
132 | "nbformat": 4,
133 | "nbformat_minor": 4
134 | }
135 |
--------------------------------------------------------------------------------
/course/videos/datasets_and_dataframes.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_dataset\n",
61 | "\n",
62 | "dataset = load_dataset(\"swiss_judgment_prediction\", \"all_languages\", split=\"train\")\n",
63 | "dataset[0]"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "# Convert the output format to pandas.DataFrame\n",
73 | "dataset.set_format(\"pandas\")\n",
74 | "dataset[0]"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "dataset.__getitem__(0)\n",
84 | "\n",
85 | "dataset.set_format(\"pandas\")\n",
86 | "\n",
87 | "dataset.__getitem__(0)"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "df = dataset.to_pandas()\n",
97 | "df.head()"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "# How are languages distributed across regions?\n",
107 | "df.groupby(\"region\")[\"language\"].value_counts()\n",
108 | "\n",
109 | "# Which legal area is most common?\n",
110 | "df[\"legal area\"].value_counts()"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "from transformers import AutoTokenizer\n",
120 | "\n",
121 | "# Load a pretrained tokenizer\n",
122 | "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
123 | "# Tokenize the `text` column\n",
124 | "dataset.map(lambda x : tokenizer(x[\"text\"]))"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "# Reset back to Arrow format\n",
134 | "dataset.reset_format()\n",
135 | "# Now we can tokenize!\n",
136 | "dataset.map(lambda x : tokenizer(x[\"text\"]))"
137 | ]
138 | }
139 | ],
140 | "metadata": {
141 | "colab": {
142 | "name": "Datasets + DataFrames = ❤️",
143 | "provenance": []
144 | }
145 | },
146 | "nbformat": 4,
147 | "nbformat_minor": 4
148 | }
149 |
--------------------------------------------------------------------------------
/course/videos/fast_tokenizers.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_dataset\n",
61 | "\n",
62 | "raw_datasets = load_dataset(\"glue\", \"mnli\")\n",
63 | "raw_datasets"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "from transformers import AutoTokenizer\n",
73 | "\n",
74 | "fast_tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n",
75 | "\n",
76 | "def tokenize_with_fast(examples):\n",
77 | " return fast_tokenizer(\n",
78 | " examples[\"premise\"], examples[\"hypothesis\"], truncation=True\n",
79 | " )"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "slow_tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\", use_fast=False)\n",
89 | "\n",
90 | "def tokenize_with_slow(examples):\n",
91 |     " return slow_tokenizer(\n",
92 | " examples[\"premise\"], examples[\"hypothesis\"], truncation=True\n",
93 | " )"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "%time tokenized_datasets = raw_datasets.map(tokenize_with_fast)"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "%time tokenized_datasets = raw_datasets.map(tokenize_with_slow)"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "%time tokenized_datasets = raw_datasets.map(tokenize_with_fast, batched=True)"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "%time tokenized_datasets = raw_datasets.map(tokenize_with_slow, batched=True)"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": []
138 | }
139 | ],
140 | "metadata": {
141 | "colab": {
142 | "name": "Why are fast tokenizers called fast?",
143 | "provenance": []
144 | }
145 | },
146 | "nbformat": 4,
147 | "nbformat_minor": 4
148 | }
149 |
--------------------------------------------------------------------------------
/sagemaker/05_spot_instances/scripts/train.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
2 | from transformers.trainer_utils import get_last_checkpoint
3 |
4 | from sklearn.metrics import accuracy_score, precision_recall_fscore_support
5 | from datasets import load_from_disk
6 | import logging
7 | import sys
8 | import argparse
9 | import os
10 |
11 | # Set up logging
12 | logger = logging.getLogger(__name__)
13 |
14 | logging.basicConfig(
15 | level=logging.getLevelName("INFO"),
16 | handlers=[logging.StreamHandler(sys.stdout)],
17 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
18 | )
19 |
20 | if __name__ == "__main__":
21 |
22 | logger.info(sys.argv)
23 |
24 | parser = argparse.ArgumentParser()
25 |
26 | # hyperparameters sent by the client are passed as command-line arguments to the script.
27 | parser.add_argument("--epochs", type=int, default=3)
28 | parser.add_argument("--train_batch_size", type=int, default=32)
29 | parser.add_argument("--eval_batch_size", type=int, default=64)
30 | parser.add_argument("--warmup_steps", type=int, default=500)
31 | parser.add_argument("--model_name", type=str)
32 | parser.add_argument("--learning_rate", type=str, default=5e-5)
33 | parser.add_argument("--output_dir", type=str)
34 |
35 | # Data, model, and output directories
36 | parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
37 | parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
38 | parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
39 | parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
40 | parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
41 |
42 | args, _ = parser.parse_known_args()
43 |
44 | # load datasets
45 | train_dataset = load_from_disk(args.training_dir)
46 | test_dataset = load_from_disk(args.test_dir)
47 |
48 | logger.info(f" loaded train_dataset length is: {len(train_dataset)}")
49 | logger.info(f" loaded test_dataset length is: {len(test_dataset)}")
50 |
51 | # compute metrics function for binary classification
52 | def compute_metrics(pred):
53 | labels = pred.label_ids
54 | preds = pred.predictions.argmax(-1)
55 | precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
56 | acc = accuracy_score(labels, preds)
57 | return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}
58 |
59 | # download model from model hub
60 | model = AutoModelForSequenceClassification.from_pretrained(args.model_name)
61 | tokenizer = AutoTokenizer.from_pretrained(args.model_name)
62 |
63 | # define training args
64 | training_args = TrainingArguments(
65 | output_dir=args.output_dir,
66 | num_train_epochs=args.epochs,
67 | per_device_train_batch_size=args.train_batch_size,
68 | per_device_eval_batch_size=args.eval_batch_size,
69 | warmup_steps=args.warmup_steps,
70 | evaluation_strategy="epoch",
71 | logging_dir=f"{args.output_data_dir}/logs",
72 | learning_rate=float(args.learning_rate),
73 | )
74 |
75 | # create Trainer instance
76 | trainer = Trainer(
77 | model=model,
78 | args=training_args,
79 | compute_metrics=compute_metrics,
80 | train_dataset=train_dataset,
81 | eval_dataset=test_dataset,
82 | tokenizer=tokenizer,
83 | )
84 |
85 |     # train model (resume from the most recent checkpoint if one exists, e.g. after a spot interruption)
86 | if get_last_checkpoint(args.output_dir) is not None:
87 | logger.info("***** continue training *****")
88 | last_checkpoint = get_last_checkpoint(args.output_dir)
89 | trainer.train(resume_from_checkpoint=last_checkpoint)
90 | else:
91 | trainer.train()
92 | # evaluate model
93 | eval_result = trainer.evaluate(eval_dataset=test_dataset)
94 |
95 |     # write eval results to a file that can be accessed later from the S3 output
96 |     with open(os.path.join(args.output_data_dir, "eval_results.txt"), "w") as writer:
97 |         print("***** Eval results *****")
98 | for key, value in sorted(eval_result.items()):
99 | writer.write(f"{key} = {value}\n")
100 |
101 | # Saves the model to s3
102 | trainer.save_model(args.model_dir)
103 |
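104 | # Illustrative launch sketch (kept as comments so nothing extra runs inside the training job).
105 | # It shows how this entry point is typically submitted with spot instances and checkpointing;
106 | # the role, bucket, versions, and hyperparameter values below are placeholders/assumptions.
107 | #
108 | # from sagemaker.huggingface import HuggingFace
109 | #
110 | # huggingface_estimator = HuggingFace(
111 | #     entry_point="train.py",
112 | #     source_dir="./scripts",
113 | #     instance_type="ml.p3.2xlarge",
114 | #     instance_count=1,
115 | #     role="<your-sagemaker-execution-role>",
116 | #     transformers_version="4.6",
117 | #     pytorch_version="1.7",
118 | #     py_version="py36",
119 | #     hyperparameters={"epochs": 3, "model_name": "distilbert-base-uncased", "output_dir": "/opt/ml/checkpoints"},
120 | #     use_spot_instances=True,                       # train on spot capacity
121 | #     max_wait=7200,                                 # total time incl. waiting for spot (>= max_run)
122 | #     max_run=3600,                                  # maximum training time
123 | #     checkpoint_s3_uri="s3://<bucket>/checkpoints", # checkpoints survive spot interruptions
124 | # )
125 | # huggingface_estimator.fit({"train": "s3://<bucket>/train", "test": "s3://<bucket>/test"})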
--------------------------------------------------------------------------------
/course/videos/debug_error.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from transformers import pipeline\n",
61 | "\n",
62 | "model_checkpoint = \"distillbert-base-cased-distilled-squad\"\n",
63 | "question_answerer = pipeline(\"question_answering\", model=model_checkpoint)\n",
64 | "\n",
65 | "context = \"\"\"\n",
66 | "🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration between them. It's straightforward to train your models with one before loading them for inference with the other.\n",
67 | "\"\"\"\n",
68 | "question = \"Which deep learning libraries back 🤗 Transformers?\"\n",
69 | "question_answerer(question=question, context=context)"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "from transformers import pipeline\n",
79 | "\n",
80 | "model_checkpoint = \"distillbert-base-cased-distilled-squad\"\n",
81 | "question_answerer = pipeline(\"question-answering\", model=model_checkpoint)\n",
82 | "\n",
83 | "context = \"\"\"\n",
84 | "🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration between them. It's straightforward to train your models with one before loading them for inference with the other.\n",
85 | "\"\"\"\n",
86 | "question = \"Which deep learning libraries back 🤗 Transformers?\"\n",
87 | "question_answerer(question=question, context=context)"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "from transformers import pipeline\n",
97 | "\n",
98 | "model_checkpoint = \"distilbert-base-cased-distilled-squad\"\n",
99 | "question_answerer = pipeline(\"question-answering\", model=model_checkpoint)\n",
100 | "\n",
101 | "context = \"\"\"\n",
102 | "🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration between them. It's straightforward to train your models with one before loading them for inference with the other.\n",
103 | "\"\"\"\n",
104 | "question = \"Which deep learning libraries back 🤗 Transformers?\"\n",
105 | "question_answerer(question=question, context=context)"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": []
114 | }
115 | ],
116 | "metadata": {
117 | "colab": {
118 | "name": "What to do when you get an error?",
119 | "provenance": []
120 | }
121 | },
122 | "nbformat": 4,
123 | "nbformat_minor": 4
124 | }
125 |
--------------------------------------------------------------------------------
/course/videos/summarization_processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_dataset, load_metric\n",
61 | "\n",
62 | "raw_datasets = load_dataset(\"xsum\")\n",
63 | "raw_datasets = raw_datasets.remove_columns([\"id\"])\n",
64 | "raw_datasets[\"train\"]"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "print(raw_datasets[\"train\"][1])"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "from transformers import AutoTokenizer\n",
83 | "\n",
84 | "model_checkpoint = \"t5-small\"\n",
85 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
86 | "\n",
87 | "sample = raw_datasets[\"train\"][1]\n",
88 | "inputs = tokenizer(sample[\"document\"])\n",
89 | "with tokenizer.as_target_tokenizer():\n",
90 | " targets = tokenizer(sample[\"summary\"])\n",
91 | "\n",
92 | "print(tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"]))\n",
93 |     "print(tokenizer.convert_ids_to_tokens(targets[\"input_ids\"]))"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "max_input_length = 1024\n",
103 | "max_target_length = 128\n",
104 | "\n",
105 | "def preprocess_function(examples):\n",
106 | " model_inputs = tokenizer(examples[\"document\"], max_length=max_input_length, truncation=True)\n",
107 | "\n",
108 | " # Setup the tokenizer for targets\n",
109 | " with tokenizer.as_target_tokenizer():\n",
110 | " labels = tokenizer(examples[\"summary\"], max_length=max_target_length, truncation=True)\n",
111 | "\n",
112 | " model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
113 | " return model_inputs\n",
114 | "\n",
115 | "tokenized_datasets = raw_datasets.map(\n",
116 | " preprocess_function, batched=True, remove_columns=[\"document\", \"summary\"]\n",
117 | ")"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 |     "from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq\n",
127 |     "model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)\n",
128 |     "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": []
137 | }
138 | ],
139 | "metadata": {
140 | "colab": {
141 | "name": "Data processing for Summarization",
142 | "provenance": []
143 | }
144 | },
145 | "nbformat": 4,
146 | "nbformat_minor": 4
147 | }
148 |
--------------------------------------------------------------------------------
/course/chapter2/section4_pt.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Tokenizers (PyTorch)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "data": {
33 | "text/plain": [
34 | "['Jim', 'Henson', 'was', 'a', 'puppeteer']"
35 | ]
36 | },
37 | "execution_count": null,
38 | "metadata": {},
39 | "output_type": "execute_result"
40 | }
41 | ],
42 | "source": [
43 | "tokenized_text = \"Jim Henson was a puppeteer\".split()\n",
44 | "print(tokenized_text)"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "from transformers import BertTokenizer\n",
54 | "\n",
55 | "tokenizer = BertTokenizer.from_pretrained(\"bert-base-cased\")"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "from transformers import AutoTokenizer\n",
65 | "\n",
66 | "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [
74 | {
75 | "data": {
76 | "text/plain": [
77 | "{'input_ids': [101, 7993, 170, 11303, 1200, 2443, 1110, 3014, 102],\n",
78 | " 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
79 | " 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}"
80 | ]
81 | },
82 | "execution_count": null,
83 | "metadata": {},
84 | "output_type": "execute_result"
85 | }
86 | ],
87 | "source": [
88 | "tokenizer(\"Using a Transformer network is simple\")"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "tokenizer.save_pretrained(\"directory_on_my_computer\")"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "data": {
107 | "text/plain": [
108 | "['Using', 'a', 'transform', '##er', 'network', 'is', 'simple']"
109 | ]
110 | },
111 | "execution_count": null,
112 | "metadata": {},
113 | "output_type": "execute_result"
114 | }
115 | ],
116 | "source": [
117 | "from transformers import AutoTokenizer\n",
118 | "\n",
119 | "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n",
120 | "\n",
121 | "sequence = \"Using a Transformer network is simple\"\n",
122 | "tokens = tokenizer.tokenize(sequence)\n",
123 | "\n",
124 | "print(tokens)"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [
132 | {
133 | "data": {
134 | "text/plain": [
135 | "[7993, 170, 11303, 1200, 2443, 1110, 3014]"
136 | ]
137 | },
138 | "execution_count": null,
139 | "metadata": {},
140 | "output_type": "execute_result"
141 | }
142 | ],
143 | "source": [
144 | "ids = tokenizer.convert_tokens_to_ids(tokens)\n",
145 | "\n",
146 | "print(ids)"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {},
153 | "outputs": [
154 | {
155 | "data": {
156 | "text/plain": [
157 | "'Using a Transformer network is simple'"
158 | ]
159 | },
160 | "execution_count": null,
161 | "metadata": {},
162 | "output_type": "execute_result"
163 | }
164 | ],
165 | "source": [
166 | "decoded_string = tokenizer.decode([7993, 170, 11303, 1200, 2443, 1110, 3014])\n",
167 | "print(decoded_string)"
168 | ]
169 | }
170 | ],
171 | "metadata": {
172 | "colab": {
173 | "name": "Tokenizers (PyTorch)",
174 | "provenance": []
175 | }
176 | },
177 | "nbformat": 4,
178 | "nbformat_minor": 4
179 | }
180 |
--------------------------------------------------------------------------------
/course/chapter2/section4_tf.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Tokenizers (TensorFlow)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "data": {
33 | "text/plain": [
34 | "['Jim', 'Henson', 'was', 'a', 'puppeteer']"
35 | ]
36 | },
37 | "execution_count": null,
38 | "metadata": {},
39 | "output_type": "execute_result"
40 | }
41 | ],
42 | "source": [
43 | "tokenized_text = \"Jim Henson was a puppeteer\".split()\n",
44 | "print(tokenized_text)"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "from transformers import BertTokenizer\n",
54 | "\n",
55 | "tokenizer = BertTokenizer.from_pretrained(\"bert-base-cased\")"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "from transformers import AutoTokenizer\n",
65 | "\n",
66 | "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [
74 | {
75 | "data": {
76 | "text/plain": [
77 | "{'input_ids': [101, 7993, 170, 11303, 1200, 2443, 1110, 3014, 102],\n",
78 | " 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
79 | " 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}"
80 | ]
81 | },
82 | "execution_count": null,
83 | "metadata": {},
84 | "output_type": "execute_result"
85 | }
86 | ],
87 | "source": [
88 | "tokenizer(\"Using a Transformer network is simple\")"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "tokenizer.save_pretrained(\"directory_on_my_computer\")"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "data": {
107 | "text/plain": [
108 | "['Using', 'a', 'transform', '##er', 'network', 'is', 'simple']"
109 | ]
110 | },
111 | "execution_count": null,
112 | "metadata": {},
113 | "output_type": "execute_result"
114 | }
115 | ],
116 | "source": [
117 | "from transformers import AutoTokenizer\n",
118 | "\n",
119 | "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n",
120 | "\n",
121 | "sequence = \"Using a Transformer network is simple\"\n",
122 | "tokens = tokenizer.tokenize(sequence)\n",
123 | "\n",
124 | "print(tokens)"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [
132 | {
133 | "data": {
134 | "text/plain": [
135 | "[7993, 170, 11303, 1200, 2443, 1110, 3014]"
136 | ]
137 | },
138 | "execution_count": null,
139 | "metadata": {},
140 | "output_type": "execute_result"
141 | }
142 | ],
143 | "source": [
144 | "ids = tokenizer.convert_tokens_to_ids(tokens)\n",
145 | "\n",
146 | "print(ids)"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {},
153 | "outputs": [
154 | {
155 | "data": {
156 | "text/plain": [
157 | "'Using a Transformer network is simple'"
158 | ]
159 | },
160 | "execution_count": null,
161 | "metadata": {},
162 | "output_type": "execute_result"
163 | }
164 | ],
165 | "source": [
166 | "decoded_string = tokenizer.decode([7993, 170, 11303, 1200, 2443, 1110, 3014])\n",
167 | "print(decoded_string)"
168 | ]
169 | }
170 | ],
171 | "metadata": {
172 | "colab": {
173 | "name": "Tokenizers (TensorFlow)",
174 | "provenance": []
175 | }
176 | },
177 | "nbformat": 4,
178 | "nbformat_minor": 4
179 | }
180 |
--------------------------------------------------------------------------------
/course/videos/clm_processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
61 | "from datasets import load_dataset, DatasetDict\n",
62 | "\n",
63 | "ds_train = load_dataset(\"huggingface-course/codeparrot-ds-train\", split=\"train\")\n",
64 | "ds_valid = load_dataset(\"huggingface-course/codeparrot-ds-valid\", split=\"train\")\n",
65 | "\n",
66 | "raw_datasets = DatasetDict(\n",
67 | " {\n",
68 | " \"train\": ds_train,\n",
69 | " \"valid\": ds_valid,\n",
70 | " }\n",
71 | ")\n",
72 | "\n",
73 | "tokenizer = AutoTokenizer.from_pretrained(\"huggingface-course/code-search-net-tokenizer\")\n",
74 | "model = AutoModelForCausalLM.from_pretrained(\"huggingface-course/codeparrot-ds\")\n",
75 | "batch = tokenizer([\"import numpy as np\"], return_tensors=\"pt\")\n",
76 | "\n",
77 | "text = \"import numpy as np\\n\"*20\n",
78 | "context_length = 128"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "outputs = tokenizer(\n",
88 | " text,\n",
89 | " truncation=True,\n",
90 | " max_length=16,\n",
91 | " return_overflowing_tokens=True,\n",
92 | " return_length=True,\n",
93 | " )\n",
94 | "\n",
95 | "print(f\"Input chunk lengths: {(outputs['length'])}\")"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "metadata": {},
102 | "outputs": [],
103 | "source": [
104 | "def tokenize(element):\n",
105 | " outputs = tokenizer(\n",
106 | " element[\"content\"],\n",
107 | " truncation=True,\n",
108 | " max_length=context_length,\n",
109 | " return_overflowing_tokens=True,\n",
110 | " return_length=True,\n",
111 | " )\n",
112 | " input_batch = []\n",
113 | " for length, input_ids in zip(outputs[\"length\"], outputs[\"input_ids\"]):\n",
114 | " if length == context_length:\n",
115 | " input_batch.append(input_ids)\n",
116 | " return {\"input_ids\": input_batch}\n",
117 | "\n",
118 | "\n",
119 | "tokenized_datasets = raw_datasets.map(\n",
120 | " tokenize, batched=True, remove_columns=raw_datasets[\"train\"].column_names\n",
121 | ")"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "output = model(input_ids=batch[\"input_ids\"], labels=batch[\"input_ids\"])\n",
131 | "loss = output.loss"
132 | ]
133 | }
134 | ],
135 | "metadata": {
136 | "colab": {
137 | "name": "Data processing for Causal Language Modeling",
138 | "provenance": []
139 | }
140 | },
141 | "nbformat": 4,
142 | "nbformat_minor": 4
143 | }
144 |
--------------------------------------------------------------------------------
/course/videos/load_custom_dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "!wget https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "from datasets import load_dataset\n",
70 | "\n",
71 | "local_csv_dataset = load_dataset(\"csv\", data_files=\"winequality-white.csv\", sep=\";\")\n",
72 | "local_csv_dataset[\"train\"]"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "# Load the dataset from the URL directly\n",
82 | "dataset_url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\"\n",
83 | "remote_csv_dataset = load_dataset(\"csv\", data_files=dataset_url, sep=\";\")\n",
84 | "remote_csv_dataset"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "dataset_url = \"https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt\"\n",
94 | "text_dataset = load_dataset(\"text\", data_files=dataset_url)\n",
95 | "text_dataset[\"train\"][:5]"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "metadata": {},
102 | "outputs": [],
103 | "source": [
104 | "dataset_url = \"https://raw.githubusercontent.com/hirupert/sede/main/data/sede/train.jsonl\"\n",
105 | "json_lines_dataset = load_dataset(\"json\", data_files=dataset_url)\n",
106 | "json_lines_dataset[\"train\"][:2]"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "dataset_url = \"https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json\"\n",
116 | "json_dataset = load_dataset(\"json\", data_files=dataset_url, field=\"data\")\n",
117 | "json_dataset"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "url = \"https://rajpurkar.github.io/SQuAD-explorer/dataset/\"\n",
127 | "data_files = {\"train\": f\"{url}train-v2.0.json\", \"validation\": f\"{url}dev-v2.0.json\"}\n",
128 | "json_dataset = load_dataset(\"json\", data_files=data_files, field=\"data\")\n",
129 | "json_dataset"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": []
138 | }
139 | ],
140 | "metadata": {
141 | "colab": {
142 | "name": "Loading a custom dataset",
143 | "provenance": []
144 | }
145 | },
146 | "nbformat": 4,
147 | "nbformat_minor": 4
148 | }
149 |
--------------------------------------------------------------------------------
/course/videos/debug_training_tf.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_dataset, load_metric\n",
61 | "from transformers import (\n",
62 | " AutoTokenizer,\n",
63 | " TFAutoModelForSequenceClassification,\n",
64 | ")\n",
65 | "\n",
66 | "raw_datasets = load_dataset(\"glue\", \"mnli\")\n",
67 | "\n",
68 | "model_checkpoint = \"distilbert-base-uncased\"\n",
69 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
70 | "\n",
71 | "\n",
72 | "def preprocess_function(examples):\n",
73 | " return tokenizer(examples[\"premise\"], examples[\"hypothesis\"], truncation=True)\n",
74 | "\n",
75 | "\n",
76 | "tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)\n",
77 | "model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint)"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": null,
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "train_dataset = tokenized_datasets[\"train\"].to_tf_dataset(\n",
87 | " columns=[\"input_ids\", \"labels\"], batch_size=16, shuffle=True\n",
88 | ")\n",
89 | "\n",
90 | "validation_dataset = tokenized_datasets[\"validation_matched\"].to_tf_dataset(\n",
91 | " columns=[\"input_ids\", \"labels\"], batch_size=16, shuffle=True\n",
92 | ")\n",
93 | "\n",
94 | "model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint)\n",
95 | "\n",
96 | "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer='adam')\n",
97 | "\n",
98 | "model.fit(train_dataset)"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "for batch in train_dataset:\n",
108 | " break"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "model.compile(optimizer='adam')"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "model = TFAutoModelForSequenceClassification.from_pretrained(\n",
127 | " model_checkpoint,\n",
128 | " num_labels=3\n",
129 | ")"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "model.compile(optimizer='adam')"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": null,
144 | "metadata": {},
145 | "outputs": [],
146 | "source": []
147 | }
148 | ],
149 | "metadata": {
150 | "colab": {
151 | "name": "Debugging the Training Pipeline (TensorFlow)",
152 | "provenance": []
153 | }
154 | },
155 | "nbformat": 4,
156 | "nbformat_minor": 4
157 | }
158 |
--------------------------------------------------------------------------------
/course/chapter8/section3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Asking for help on the forums"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "from transformers import AutoTokenizer, AutoModel\n",
33 | "\n",
34 | "model_checkpoint = \"distilbert-base-uncased\"\n",
35 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
36 | "model = AutoModel.from_pretrained(model_checkpoint)"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "text = \"\"\"\n",
46 | "Generation One is a retroactive term for the Transformers characters that\n",
47 | "appeared between 1984 and 1993. The Transformers began with the 1980s Japanese\n",
48 | "toy lines Micro Change and Diaclone. They presented robots able to transform\n",
49 | "into everyday vehicles, electronic items or weapons. Hasbro bought the Micro\n",
50 | "Change and Diaclone toys, and partnered with Takara. Marvel Comics was hired by\n",
51 | "Hasbro to create the backstory; editor-in-chief Jim Shooter wrote an overall\n",
52 |     "story, and gave the task of creating the characters to writer Dennis O'Neil.\n",
53 | "Unhappy with O'Neil's work (although O'Neil created the name \"Optimus Prime\"),\n",
54 | "Shooter chose Bob Budiansky to create the characters.\n",
55 | "\n",
56 | "The Transformers mecha were largely designed by Shōji Kawamori, the creator of\n",
57 | "the Japanese mecha anime franchise Macross (which was adapted into the Robotech\n",
58 | "franchise in North America). Kawamori came up with the idea of transforming\n",
59 | "mechs while working on the Diaclone and Macross franchises in the early 1980s\n",
60 | "(such as the VF-1 Valkyrie in Macross and Robotech), with his Diaclone mechs\n",
61 | "later providing the basis for Transformers.\n",
62 | "\n",
63 | "The primary concept of Generation One is that the heroic Optimus Prime, the\n",
64 | "villainous Megatron, and their finest soldiers crash land on pre-historic Earth\n",
65 | "in the Ark and the Nemesis before awakening in 1985, Cybertron hurtling through\n",
66 | "the Neutral zone as an effect of the war. The Marvel comic was originally part\n",
67 | "of the main Marvel Universe, with appearances from Spider-Man and Nick Fury,\n",
68 | "plus some cameos, as well as a visit to the Savage Land.\n",
69 | "\n",
70 | "The Transformers TV series began around the same time. Produced by Sunbow\n",
71 | "Productions and Marvel Productions, later Hasbro Productions, from the start it\n",
72 | "contradicted Budiansky's backstories. The TV series shows the Autobots looking\n",
73 | "for new energy sources, and crash landing as the Decepticons attack. Marvel\n",
74 | "interpreted the Autobots as destroying a rogue asteroid approaching Cybertron.\n",
75 | "Shockwave is loyal to Megatron in the TV series, keeping Cybertron in a\n",
76 | "stalemate during his absence, but in the comic book he attempts to take command\n",
77 | "of the Decepticons. The TV series would also differ wildly from the origins\n",
78 | "Budiansky had created for the Dinobots, the Decepticon turned Autobot Jetfire\n",
79 | "(known as Skyfire on TV), the Constructicons (who combine to form\n",
80 | "Devastator),[19][20] and Omega Supreme. The Marvel comic establishes early on\n",
81 | "that Prime wields the Creation Matrix, which gives life to machines. In the\n",
82 | "second season, the two-part episode The Key to Vector Sigma introduced the\n",
83 | "ancient Vector Sigma computer, which served the same original purpose as the\n",
84 | "Creation Matrix (giving life to Transformers), and its guardian Alpha Trion.\n",
85 | "\"\"\"\n",
86 | "\n",
87 | "inputs = tokenizer(text, return_tensors=\"pt\")\n",
88 | "logits = model(**inputs).logits"
89 | ]
90 | }
91 | ],
92 | "metadata": {
93 | "colab": {
94 | "name": "Asking for help on the forums",
95 | "provenance": []
96 | }
97 | },
98 | "nbformat": 4,
99 | "nbformat_minor": 4
100 | }
101 |
--------------------------------------------------------------------------------
/sagemaker/02_getting_started_tensorflow/scripts/train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import os
4 | import sys
5 |
6 | import tensorflow as tf
7 | from datasets import load_dataset
8 | from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding, create_optimizer
9 |
10 |
11 | if __name__ == "__main__":
12 |
13 | parser = argparse.ArgumentParser()
14 |
15 | # Hyperparameters sent by the client are passed as command-line arguments to the script.
16 | parser.add_argument("--epochs", type=int, default=3)
17 | parser.add_argument("--train_batch_size", type=int, default=16)
18 | parser.add_argument("--eval_batch_size", type=int, default=8)
19 | parser.add_argument("--model_id", type=str)
20 | parser.add_argument("--learning_rate", type=str, default=3e-5)
21 |
22 | # Data, model, and output directories
23 | parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
24 | parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
25 | parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
26 |
27 | args, _ = parser.parse_known_args()
28 |
29 | # Set up logging
30 | logger = logging.getLogger(__name__)
31 |
32 | logging.basicConfig(
33 | level=logging.getLevelName("INFO"),
34 | handlers=[logging.StreamHandler(sys.stdout)],
35 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
36 | )
37 |
38 | # Load tokenizer
39 | tokenizer = AutoTokenizer.from_pretrained(args.model_id)
40 |
41 | # Load DatasetDict
42 | dataset = load_dataset("imdb")
43 |
44 | # Preprocess train dataset
45 | def preprocess_function(examples):
46 | return tokenizer(examples["text"], truncation=True)
47 |
48 | encoded_dataset = dataset.map(preprocess_function, batched=True)
49 |
50 | # define tokenizer_columns
51 | # tokenizer_columns is the list of keys from the dataset that get passed to the TensorFlow model
52 | tokenizer_columns = ["attention_mask", "input_ids"]
53 |
54 | # convert to TF datasets
55 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
56 | encoded_dataset["train"] = encoded_dataset["train"].rename_column("label", "labels")
57 | tf_train_dataset = encoded_dataset["train"].to_tf_dataset(
58 | columns=tokenizer_columns,
59 | label_cols=["labels"],
60 | shuffle=True,
61 |         batch_size=args.train_batch_size,
62 | collate_fn=data_collator,
63 | )
64 | encoded_dataset["test"] = encoded_dataset["test"].rename_column("label", "labels")
65 | tf_validation_dataset = encoded_dataset["test"].to_tf_dataset(
66 | columns=tokenizer_columns,
67 | label_cols=["labels"],
68 | shuffle=False,
69 |         batch_size=args.eval_batch_size,
70 | collate_fn=data_collator,
71 | )
72 |
73 | # Prepare model labels - useful in inference API
74 | labels = encoded_dataset["train"].features["labels"].names
75 | num_labels = len(labels)
76 | label2id, id2label = dict(), dict()
77 | for i, label in enumerate(labels):
78 | label2id[label] = str(i)
79 | id2label[str(i)] = label
80 |
81 | # download model from model hub
82 | model = TFAutoModelForSequenceClassification.from_pretrained(
83 | args.model_id, num_labels=num_labels, label2id=label2id, id2label=id2label
84 | )
85 |
86 | # create Adam optimizer with learning rate scheduling
87 | batches_per_epoch = len(encoded_dataset["train"]) // args.train_batch_size
88 | total_train_steps = int(batches_per_epoch * args.epochs)
89 |
90 |     optimizer, _ = create_optimizer(init_lr=float(args.learning_rate), num_warmup_steps=0, num_train_steps=total_train_steps)
91 | loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
92 |
93 | # define metric and compile model
94 | metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]
95 | model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
96 |
97 | # Training
98 | logger.info("*** Train ***")
99 | train_results = model.fit(
100 | tf_train_dataset,
101 | epochs=args.epochs,
102 | validation_data=tf_validation_dataset,
103 | )
104 |
105 | output_eval_file = os.path.join(args.output_data_dir, "train_results.txt")
106 |
107 | with open(output_eval_file, "w") as writer:
108 | logger.info("***** Train results *****")
109 | logger.info(train_results)
110 | for key, value in train_results.history.items():
111 | logger.info(" %s = %s", key, value)
112 | writer.write("%s = %s\n" % (key, value))
113 |
114 | # Save result
115 | model.save_pretrained(args.model_dir)
116 | tokenizer.save_pretrained(args.model_dir)
117 |
--------------------------------------------------------------------------------
/course/videos/save_load_dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_dataset\n",
61 | "\n",
62 | "raw_datasets = load_dataset(\"allocine\")\n",
63 | "raw_datasets.cache_files"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "raw_datasets.save_to_disk(\"my-arrow-datasets\")"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "from datasets import load_from_disk\n",
82 | "\n",
83 | "arrow_datasets_reloaded = load_from_disk(\"my-arrow-datasets\")\n",
84 | "arrow_datasets_reloaded"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "for split, dataset in raw_datasets.items():\n",
94 | " dataset.to_csv(f\"my-dataset-{split}.csv\", index=None)"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "data_files = {\n",
104 | " \"train\": \"my-dataset-train.csv\",\n",
105 | " \"validation\": \"my-dataset-validation.csv\",\n",
106 | " \"test\": \"my-dataset-test.csv\",\n",
107 | "}\n",
108 | "\n",
109 | "csv_datasets_reloaded = load_dataset(\"csv\", data_files=data_files)\n",
110 | "csv_datasets_reloaded"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "# Save in JSON Lines format\n",
120 | "for split, dataset in raw_datasets.items():\n",
121 | " dataset.to_json(f\"my-dataset-{split}.jsonl\")\n",
122 | "\n",
123 | "# Save in Parquet format\n",
124 | "for split, dataset in raw_datasets.items():\n",
125 | " dataset.to_parquet(f\"my-dataset-{split}.parquet\")"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": null,
131 | "metadata": {},
132 | "outputs": [],
133 | "source": [
134 | "json_data_files = {\n",
135 | " \"train\": \"my-dataset-train.jsonl\",\n",
136 | " \"validation\": \"my-dataset-validation.jsonl\",\n",
137 | " \"test\": \"my-dataset-test.jsonl\",\n",
138 | "}\n",
139 | "\n",
140 | "parquet_data_files = {\n",
141 | " \"train\": \"my-dataset-train.parquet\",\n",
142 | " \"validation\": \"my-dataset-validation.parquet\",\n",
143 | " \"test\": \"my-dataset-test.parquet\",\n",
144 | "}\n",
145 | "\n",
146 | "# Reload with the `json` script\n",
147 | "json_datasets_reloaded = load_dataset(\"json\", data_files=json_data_files)\n",
148 | "# Reload with the `parquet` script\n",
149 | "parquet_datasets_reloaded = load_dataset(\"parquet\", data_files=parquet_data_files)"
150 | ]
151 | }
152 | ],
153 | "metadata": {
154 | "colab": {
155 | "name": "Saving and reloading a dataset",
156 | "provenance": []
157 | }
158 | },
159 | "nbformat": 4,
160 | "nbformat_minor": 4
161 | }
162 |
--------------------------------------------------------------------------------
/sagemaker/15_training_compiler/scripts/train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import os
4 | import random
5 | import sys
6 |
7 | import numpy as np
8 | import torch
9 | from datasets import load_from_disk, load_metric
10 | from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
11 | from transformers.trainer_utils import get_last_checkpoint
12 |
13 | if __name__ == "__main__":
14 |
15 | parser = argparse.ArgumentParser()
16 |
17 | # hyperparameters sent by the client are passed as command-line arguments to the script.
18 | parser.add_argument("--epochs", type=int, default=3)
19 | parser.add_argument("--train_batch_size", type=int, default=32)
20 | parser.add_argument("--eval_batch_size", type=int, default=64)
21 | parser.add_argument("--warmup_steps", type=int, default=500)
22 | parser.add_argument("--model_id", type=str)
23 | parser.add_argument("--learning_rate", type=str, default=5e-5)
24 | parser.add_argument("--fp16", type=bool, default=True)
25 |
26 | # Data, model, and output directories
27 | parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
28 | parser.add_argument("--output_dir", type=str, default=os.environ["SM_MODEL_DIR"])
29 | parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
30 | parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
31 | parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
32 |
33 | args, _ = parser.parse_known_args()
34 |
35 | # is needed for Amazon SageMaker Training Compiler
36 | os.environ["GPU_NUM_DEVICES"] = args.n_gpus
37 |
38 | # Set up logging
39 | logger = logging.getLogger(__name__)
40 |
41 | logging.basicConfig(
42 | level=logging.getLevelName("INFO"),
43 | handlers=[logging.StreamHandler(sys.stdout)],
44 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
45 | )
46 |
47 | # load datasets
48 | train_dataset = load_from_disk(args.training_dir)
49 | test_dataset = load_from_disk(args.test_dir)
50 |
51 | logger.info(f" loaded train_dataset length is: {len(train_dataset)}")
52 | logger.info(f" loaded test_dataset length is: {len(test_dataset)}")
53 |
54 | # define metrics and metrics function
55 | metric = load_metric("accuracy")
56 |
57 | def compute_metrics(eval_pred):
58 | predictions, labels = eval_pred
59 | predictions = np.argmax(predictions, axis=1)
60 | return metric.compute(predictions=predictions, references=labels)
61 |
62 | # Prepare model labels - useful in inference API
63 | labels = train_dataset.features["labels"].names
64 | num_labels = len(labels)
65 | label2id, id2label = dict(), dict()
66 | for i, label in enumerate(labels):
67 | label2id[label] = str(i)
68 | id2label[str(i)] = label
69 |
70 | # download model from model hub
71 | model = AutoModelForSequenceClassification.from_pretrained(
72 | args.model_id, num_labels=num_labels, label2id=label2id, id2label=id2label
73 | )
74 | tokenizer = AutoTokenizer.from_pretrained(args.model_id)
75 |
76 | # define training args
77 | training_args = TrainingArguments(
78 | output_dir=args.output_dir,
79 | overwrite_output_dir=True if get_last_checkpoint(args.output_dir) is not None else False,
80 | num_train_epochs=args.epochs,
81 | per_device_train_batch_size=args.train_batch_size,
82 | per_device_eval_batch_size=args.eval_batch_size,
83 | warmup_steps=args.warmup_steps,
84 | fp16=args.fp16,
85 | evaluation_strategy="epoch",
86 | save_strategy="epoch",
87 | save_total_limit=2,
88 | logging_dir=f"{args.output_data_dir}/logs",
89 | learning_rate=float(args.learning_rate),
90 | load_best_model_at_end=True,
91 | metric_for_best_model="accuracy",
92 | disable_tqdm=True,
93 | )
94 |
95 | # create Trainer instance
96 | trainer = Trainer(
97 | model=model,
98 | args=training_args,
99 | compute_metrics=compute_metrics,
100 | train_dataset=train_dataset,
101 | eval_dataset=test_dataset,
102 | tokenizer=tokenizer,
103 | )
104 |
105 | # train model
106 | trainer.train()
107 |
108 | # evaluate model
109 | eval_result = trainer.evaluate(eval_dataset=test_dataset)
110 |
111 |     # write eval results to a file that can be accessed later from the S3 output
112 |     with open(os.path.join(args.output_data_dir, "eval_results.txt"), "w") as writer:
113 |         print("***** Eval results *****")
114 | for key, value in sorted(eval_result.items()):
115 | writer.write(f"{key} = {value}\n")
116 | print(f"{key} = {value}\n")
117 |
118 | # Save the model to S3; use os.environ["SM_MODEL_DIR"] to make sure checkpointing works
119 | trainer.save_model(os.environ["SM_MODEL_DIR"])
120 |
--------------------------------------------------------------------------------
/course/videos/mlm_processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_dataset\n",
61 | "\n",
62 | "raw_datasets = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\")\n",
63 | "raw_datasets[\"train\"]"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "from datasets import load_dataset\n",
73 | "from transformers import AutoTokenizer\n",
74 | "\n",
75 | "raw_datasets = load_dataset(\"imdb\")\n",
76 | "raw_datasets = raw_datasets.remove_columns(\"label\")\n",
77 | "\n",
78 | "model_checkpoint = \"distilbert-base-cased\"\n",
79 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
80 | "context_length = 128\n",
81 | "\n",
82 | "def tokenize_pad_and_truncate(texts):\n",
83 | " return tokenizer(texts[\"text\"], truncation=True, padding=\"max_length\", max_length=context_length)\n",
84 | "\n",
85 | "tokenized_datasets = raw_datasets.map(tokenize_pad_and_truncate, batched=True)"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "def tokenize_and_chunk(texts):\n",
95 | " return tokenizer(\n",
96 | " texts[\"text\"], truncation=True, max_length=context_length,\n",
97 | " return_overflowing_tokens=True\n",
98 | " )\n",
99 | "\n",
100 | "tokenized_datasets = raw_datasets.map(\n",
101 | " tokenize_and_chunk, batched=True, remove_columns=[\"text\"]\n",
102 | ")\n",
103 | "\n",
104 | "len(raw_datasets[\"train\"]), len(tokenized_datasets[\"train\"])"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "def tokenize_and_chunk(texts):\n",
114 | " all_input_ids = []\n",
115 | " for input_ids in tokenizer(texts[\"text\"])[\"input_ids\"]:\n",
116 | " all_input_ids.extend(input_ids)\n",
117 | " all_input_ids.append(tokenizer.eos_token_id)\n",
118 | " \n",
119 | " chunks = []\n",
120 | " for idx in range(0, len(all_input_ids), context_length):\n",
121 | " chunks.append(all_input_ids[idx: idx + context_length])\n",
122 | " return {\"input_ids\": chunks}\n",
123 | "\n",
124 | "tokenized_datasets = raw_datasets.map(tokenize_and_chunk, batched=True, remove_columns=[\"text\"])\n",
125 | "\n",
126 | "len(raw_datasets[\"train\"]), len(tokenized_datasets[\"train\"])"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "from transformers import DataCollatorForLanguageModeling\n",
136 | "\n",
137 | "data_collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15)"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {},
144 | "outputs": [],
145 | "source": []
146 | }
147 | ],
148 | "metadata": {
149 | "colab": {
150 | "name": "Data processing for Masked Language Modeling",
151 | "provenance": []
152 | }
153 | },
154 | "nbformat": 4,
155 | "nbformat_minor": 4
156 | }
157 |
--------------------------------------------------------------------------------
/course/videos/building_tokenizer.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_dataset\n",
61 | "\n",
62 | "dataset = load_dataset(\"wikitext\", name=\"wikitext-2-raw-v1\", split=\"train\")\n",
63 | "\n",
64 | "\n",
65 | "def get_training_corpus():\n",
66 | " for i in range(0, len(dataset), 1000):\n",
67 | " yield dataset[i : i + 1000][\"text\"]"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers, processors, decoders"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "tokenizer = Tokenizer(models.WordPiece(unk_token=\"[UNK]\"))"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "tokenizer.normalizer = normalizers.Sequence(\n",
95 | " [\n",
96 | " normalizers.Replace(Regex(r\"[\\p{Other}&&[^\\n\\t\\r]]\"), \"\"),\n",
97 | " normalizers.Replace(Regex(r\"[\\s]\"), \" \"),\n",
98 | " normalizers.Lowercase(),\n",
99 | " normalizers.NFD(), normalizers.StripAccents()]\n",
100 | ")"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "special_tokens = [\"[UNK]\", \"[PAD]\", \"[CLS]\", \"[SEP]\", \"[MASK]\"]\n",
119 | "trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {},
126 | "outputs": [],
127 | "source": [
128 | "tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "cls_token_id = tokenizer.token_to_id(\"[CLS]\")\n",
138 | "sep_token_id = tokenizer.token_to_id(\"[SEP]\")\n",
139 | "tokenizer.post_processor = processors.TemplateProcessing(\n",
140 | " single=f\"[CLS]:0 $A:0 [SEP]:0\",\n",
141 | " pair=f\"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1\",\n",
142 | " special_tokens=[(\"[CLS]\", cls_token_id), (\"[SEP]\", sep_token_id)],\n",
143 | ")"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "metadata": {},
150 | "outputs": [],
151 | "source": [
152 | "tokenizer.decoder = decoders.WordPiece(prefix=\"##\")"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {},
159 | "outputs": [],
160 | "source": []
161 | }
162 | ],
163 | "metadata": {
164 | "colab": {
165 | "name": "Building a new tokenizer",
166 | "provenance": []
167 | }
168 | },
169 | "nbformat": 4,
170 | "nbformat_minor": 4
171 | }
172 |
--------------------------------------------------------------------------------
/course/videos/memory_mapping_streaming.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_dataset\n",
61 | "\n",
62 | "data_files = \"https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst\"\n",
63 | "large_dataset = load_dataset(\"json\", data_files=data_files, split=\"train\")\n",
64 | "size_gb = large_dataset.dataset_size / (1024 ** 3)\n",
65 | "print(f\"Dataset size (cache file) : {size_gb:.2f} GB\")"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "import psutil\n",
75 | "\n",
76 | "# Process.memory_info is expressed in bytes, so convert to megabytes\n",
77 | "print(f\"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB\")"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": null,
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "import timeit\n",
87 | "\n",
88 | "code_snippet = \"\"\"batch_size = 1000\n",
89 | "\n",
90 | "for idx in range(0, len(large_dataset), batch_size):\n",
91 | " _ = large_dataset[idx:idx + batch_size]\n",
92 | "\"\"\"\n",
93 | "\n",
94 | "time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())\n",
95 | "print(\n",
96 | " f\"Iterated over {len(large_dataset)} examples (about {size_gb:.1f} GB) in \"\n",
97 | " f\"{time:.1f}s, i.e. {size_gb/time:.3f} GB/s\"\n",
98 | ")"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "large_dataset_streamed = load_dataset(\n",
108 | " \"json\", data_files=data_files, split=\"train\", streaming=True)\n",
109 | "\n",
110 | "next(iter(large_dataset_streamed))"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "type(large_dataset_streamed)"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {},
126 | "outputs": [],
127 | "source": [
128 | "from transformers import AutoTokenizer\n",
129 | "\n",
130 | "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n",
131 | "tokenized_dataset = large_dataset_streamed.map(lambda x: tokenizer(x[\"text\"]))\n",
132 | "next(iter(tokenized_dataset))"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "# Select the first 5 examples \n",
142 | "dataset_head = large_dataset_streamed.take(5)\n",
143 | "list(dataset_head)"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "metadata": {},
150 | "outputs": [],
151 | "source": [
152 | "# Skip the first 1,000 examples and include the rest in the training set\n",
153 | "train_dataset = large_dataset_streamed.skip(1000)\n",
154 | "# Take the first 1,000 examples for the validation set\n",
155 | "validation_dataset = large_dataset_streamed.take(1000)"
156 | ]
157 | }
158 | ],
159 | "metadata": {
160 | "colab": {
161 | "name": "Memory Mapping & streaming",
162 | "provenance": []
163 | }
164 | },
165 | "nbformat": 4,
166 | "nbformat_minor": 4
167 | }
168 |
--------------------------------------------------------------------------------
/course/videos/train_new_tokenizer.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from transformers import BertTokenizerFast\n",
61 | "\n",
62 | "tokenizer = BertTokenizerFast.from_pretrained(\n",
63 | " 'huggingface-course/bert-base-uncased-tokenizer-without-normalizer'\n",
64 | ")"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "text = \"here is a sentence adapted to our tokenizer\"\n",
74 | "print(tokenizer.tokenize(text))"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "text = \"এই বাক্যটি আমাদের টোকেনাইজারের উপযুক্ত নয়\"\n",
84 | "print(tokenizer.tokenize(text))"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "text = \"this tokenizer does not know àccënts and CAPITAL LETTERS\"\n",
94 | "print(tokenizer.tokenize(text))"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "text = \"the medical vocabulary is divided into many sub-token: paracetamol, phrayngitis\"\n",
104 | "print(tokenizer.tokenize(text))"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "from datasets import load_dataset\n",
114 | "\n",
115 | "raw_datasets = load_dataset(\"code_search_net\", \"python\")"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "def get_training_corpus():\n",
125 | " dataset = raw_datasets[\"train\"]\n",
126 | " for start_idx in range(0, len(dataset), 1000):\n",
127 | " samples = dataset[start_idx : start_idx + 1000]\n",
128 | " yield samples[\"whole_func_string\"]"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "from transformers import AutoTokenizer\n",
138 | "\n",
139 | "old_tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
140 | "training_corpus = get_training_corpus()\n",
141 | "new_tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 52000)\n",
142 | "new_tokenizer.save_pretrained(\"code-search-net-tokenizer\")"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "metadata": {},
149 | "outputs": [],
150 | "source": [
151 | "example = \"\"\"class LinearLayer():\n",
152 | " def __init__(self, input_size, output_size):\n",
153 | " self.weight = torch.randn(input_size, output_size)\n",
154 | " self.bias = torch.zeros(output_size)\n",
155 | "\n",
156 | " def __call__(self, x):\n",
157 | " return x @ self.weights + self.bias\n",
158 | " \"\"\"\n",
159 | "\n",
160 | "print(old_tokenizer.tokenize(example))\n",
161 | "print(new_tokenizer.tokenize(example))"
162 | ]
163 | }
164 | ],
165 | "metadata": {
166 | "colab": {
167 | "name": "Training a new tokenizer",
168 | "provenance": []
169 | }
170 | },
171 | "nbformat": 4,
172 | "nbformat_minor": 4
173 | }
174 |
--------------------------------------------------------------------------------
/course/chapter5/section2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# What if my dataset isn't on the Hub?"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz\n",
33 | "!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "!gzip -dkv SQuAD_it-*.json.gz"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "from datasets import load_dataset\n",
52 | "\n",
53 | "squad_it_dataset = load_dataset(\"json\", data_files=\"SQuAD_it-train.json\", field=\"data\")"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "metadata": {},
60 | "outputs": [
61 | {
62 | "data": {
63 | "text/plain": [
64 | "DatasetDict({\n",
65 | " train: Dataset({\n",
66 | " features: ['title', 'paragraphs'],\n",
67 | " num_rows: 442\n",
68 | " })\n",
69 | "})"
70 | ]
71 | },
72 | "execution_count": null,
73 | "metadata": {},
74 | "output_type": "execute_result"
75 | }
76 | ],
77 | "source": [
78 | "squad_it_dataset"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [
86 | {
87 | "data": {
88 | "text/plain": [
89 | "{\n",
90 | " \"title\": \"Terremoto del Sichuan del 2008\",\n",
91 | " \"paragraphs\": [\n",
92 | " {\n",
93 | " \"context\": \"Il terremoto del Sichuan del 2008 o il terremoto...\",\n",
94 | " \"qas\": [\n",
95 | " {\n",
96 | " \"answers\": [{\"answer_start\": 29, \"text\": \"2008\"}],\n",
97 | " \"id\": \"56cdca7862d2951400fa6826\",\n",
98 | " \"question\": \"In quale anno si è verificato il terremoto nel Sichuan?\",\n",
99 | " },\n",
100 | " ...\n",
101 | " ],\n",
102 | " },\n",
103 | " ...\n",
104 | " ],\n",
105 | "}"
106 | ]
107 | },
108 | "execution_count": null,
109 | "metadata": {},
110 | "output_type": "execute_result"
111 | }
112 | ],
113 | "source": [
114 | "squad_it_dataset[\"train\"][0]"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {},
121 | "outputs": [
122 | {
123 | "data": {
124 | "text/plain": [
125 | "DatasetDict({\n",
126 | " train: Dataset({\n",
127 | " features: ['title', 'paragraphs'],\n",
128 | " num_rows: 442\n",
129 | " })\n",
130 | " test: Dataset({\n",
131 | " features: ['title', 'paragraphs'],\n",
132 | " num_rows: 48\n",
133 | " })\n",
134 | "})"
135 | ]
136 | },
137 | "execution_count": null,
138 | "metadata": {},
139 | "output_type": "execute_result"
140 | }
141 | ],
142 | "source": [
143 | "data_files = {\"train\": \"SQuAD_it-train.json\", \"test\": \"SQuAD_it-test.json\"}\n",
144 | "squad_it_dataset = load_dataset(\"json\", data_files=data_files, field=\"data\")\n",
145 | "squad_it_dataset"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "data_files = {\"train\": \"SQuAD_it-train.json.gz\", \"test\": \"SQuAD_it-test.json.gz\"}\n",
155 | "squad_it_dataset = load_dataset(\"json\", data_files=data_files, field=\"data\")"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "url = \"https://github.com/crux82/squad-it/raw/master/\"\n",
165 | "data_files = {\n",
166 | " \"train\": url + \"SQuAD_it-train.json.gz\",\n",
167 | " \"test\": url + \"SQuAD_it-test.json.gz\",\n",
168 | "}\n",
169 | "squad_it_dataset = load_dataset(\"json\", data_files=data_files, field=\"data\")"
170 | ]
171 | }
172 | ],
173 | "metadata": {
174 | "colab": {
175 | "name": "What if my dataset isn't on the Hub?",
176 | "provenance": []
177 | }
178 | },
179 | "nbformat": 4,
180 | "nbformat_minor": 4
181 | }
182 |
--------------------------------------------------------------------------------
/course/videos/slice_and_dice.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_dataset\n",
61 | "\n",
62 | "squad = load_dataset(\"squad\", split=\"train\")\n",
63 | "squad[0]"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "squad_shuffled = squad.shuffle(seed=666)\n",
73 | "squad_shuffled[0]"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "dataset = squad.train_test_split(test_size=0.1)\n",
83 | "dataset"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "indices = [0, 10, 20, 40, 80]\n",
93 | "examples = squad.select(indices)\n",
94 | "examples"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "sample = squad.shuffle().select(range(5))\n",
104 | "sample"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "squad_filtered = squad.filter(lambda x : x[\"title\"].startswith(\"L\"))\n",
114 | "squad_filtered[0]"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {},
121 | "outputs": [],
122 | "source": [
123 | "squad.rename_column(\"context\", \"passages\")"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "metadata": {},
130 | "outputs": [],
131 | "source": [
132 | "squad.remove_columns([\"id\", \"title\"])"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "squad"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "squad.flatten()"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": null,
156 | "metadata": {},
157 | "outputs": [],
158 | "source": [
159 | "def lowercase_title(example):\n",
160 | " return {\"title\": example[\"title\"].lower()}\n",
161 | "\n",
162 | "squad_lowercase = squad.map(lowercase_title)\n",
163 | "# Peek at random sample\n",
164 | "squad_lowercase.shuffle(seed=42)[\"title\"][:5]"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": null,
170 | "metadata": {},
171 | "outputs": [],
172 | "source": [
173 | "from transformers import AutoTokenizer\n",
174 | "\n",
175 | "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n",
176 | "\n",
177 | "def tokenize_title(example):\n",
178 | " return tokenizer(example[\"title\"])\n",
179 | "\n",
180 | "squad.map(tokenize_title, batched=True, batch_size=500)"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "metadata": {},
187 | "outputs": [],
188 | "source": []
189 | }
190 | ],
191 | "metadata": {
192 | "colab": {
193 | "name": "Slide and dice a dataset 🔪",
194 | "provenance": []
195 | }
196 | },
197 | "nbformat": 4,
198 | "nbformat_minor": 4
199 | }
200 |
--------------------------------------------------------------------------------
/course/videos/custom_loss.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
61 | "from accelerate import Accelerator\n",
62 | "\n",
63 | "accelerator = Accelerator()\n",
64 | "tokenizer = AutoTokenizer.from_pretrained(\"huggingface-course/code-search-net-tokenizer\")\n",
65 | "model = AutoModelForCausalLM.from_pretrained(\"huggingface-course/codeparrot-ds\")\n",
66 | "\n",
67 | "keytoken_ids = []\n",
68 | "for keyword in [\n",
69 | " \"plt\",\n",
70 | " \"pd\",\n",
71 | " \"sk\",\n",
72 | " \"fit\",\n",
73 | " \"predict\",\n",
74 | " \" plt\",\n",
75 | " \" pd\",\n",
76 | " \" sk\",\n",
77 | " \" fit\",\n",
78 | " \" predict\",\n",
79 | "]:\n",
80 | " ids = tokenizer([keyword]).input_ids[0]\n",
81 | " keytoken_ids.append(ids[0])\n",
82 | "\n",
83 | "batch = tokenizer([\"import numpy as np\"], return_tensors=\"pt\")\n",
84 | "model = accelerator.prepare(model)"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "from torch.nn import CrossEntropyLoss\n",
94 | "import torch\n",
95 | "\n",
96 | "\n",
97 | "def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):\n",
98 | " # Shift so that tokens < n predict n\n",
99 | " shift_labels = inputs[..., 1:].contiguous()\n",
100 | " shift_logits = logits[..., :-1, :].contiguous()\n",
101 | " # Calculate per-token loss\n",
102 | " loss_fct = CrossEntropyLoss(reduce=False)\n",
103 | " loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))\n",
104 | " # Resize and average loss per sample\n",
105 | " loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(axis=1)\n",
106 | " # Calculate and scale weighting\n",
107 | " weights = torch.stack([(inputs == kt).float() for kt in keytoken_ids]).sum(\n",
108 | " axis=[0, 2]\n",
109 | " )\n",
110 | " weights = alpha * (1.0 + weights)\n",
111 | " # Calculate weighted average\n",
112 | " weighted_loss = (loss_per_sample * weights).mean()\n",
113 | " return weighted_loss"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "logits = model(batch[\"input_ids\"]).logits\n",
123 | "loss = keytoken_weighted_loss(batch[\"input_ids\"], logits, keytoken_ids)\n",
124 | "accelerator.backward(loss)"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "from transformers import Trainer\n",
134 | "\n",
135 | "class MyTrainer(Trainer):\n",
136 | " def compute_loss(self, model, inputs, return_outputs=False):\n",
137 | " input_ids = inputs.get(\"input_ids\")\n",
138 | " outputs = model(input_ids)\n",
139 | " loss = keytoken_weighted_loss(input_ids, outputs.logits, keytoken_ids)\n",
140 | "\n",
141 | " return (loss, outputs) if return_outputs else loss"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": []
150 | }
151 | ],
152 | "metadata": {
153 | "colab": {
154 | "name": "Using a custom loss function",
155 | "provenance": []
156 | }
157 | },
158 | "nbformat": 4,
159 | "nbformat_minor": 4
160 | }
161 |
--------------------------------------------------------------------------------
/course/chapter3/section3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Fine-tuning a model with the Trainer API"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "from datasets import load_dataset\n",
33 | "from transformers import AutoTokenizer, DataCollatorWithPadding\n",
34 | "\n",
35 | "raw_datasets = load_dataset(\"glue\", \"mrpc\")\n",
36 | "checkpoint = \"bert-base-uncased\"\n",
37 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
38 | "\n",
39 | "\n",
40 | "def tokenize_function(example):\n",
41 | " return tokenizer(example[\"sentence1\"], example[\"sentence2\"], truncation=True)\n",
42 | "\n",
43 | "\n",
44 | "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n",
45 | "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "from transformers import TrainingArguments\n",
55 | "\n",
56 | "training_args = TrainingArguments(\"test-trainer\")"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "from transformers import AutoModelForSequenceClassification\n",
66 | "\n",
67 | "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "from transformers import Trainer\n",
77 | "\n",
78 | "trainer = Trainer(\n",
79 | " model,\n",
80 | " training_args,\n",
81 | " train_dataset=tokenized_datasets[\"train\"],\n",
82 | " eval_dataset=tokenized_datasets[\"validation\"],\n",
83 | " data_collator=data_collator,\n",
84 | " tokenizer=tokenizer,\n",
85 | ")"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "trainer.train()"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [
102 | {
103 | "data": {
104 | "text/plain": [
105 | "(408, 2) (408,)"
106 | ]
107 | },
108 | "execution_count": null,
109 | "metadata": {},
110 | "output_type": "execute_result"
111 | }
112 | ],
113 | "source": [
114 | "predictions = trainer.predict(tokenized_datasets[\"validation\"])\n",
115 | "print(predictions.predictions.shape, predictions.label_ids.shape)"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "import numpy as np\n",
125 | "\n",
126 | "preds = np.argmax(predictions.predictions, axis=-1)"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {},
133 | "outputs": [
134 | {
135 | "data": {
136 | "text/plain": [
137 | "{'accuracy': 0.8578431372549019, 'f1': 0.8996539792387542}"
138 | ]
139 | },
140 | "execution_count": null,
141 | "metadata": {},
142 | "output_type": "execute_result"
143 | }
144 | ],
145 | "source": [
146 | "from datasets import load_metric\n",
147 | "\n",
148 | "metric = load_metric(\"glue\", \"mrpc\")\n",
149 | "metric.compute(predictions=preds, references=predictions.label_ids)"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {},
156 | "outputs": [],
157 | "source": [
158 | "def compute_metrics(eval_preds):\n",
159 | " metric = load_metric(\"glue\", \"mrpc\")\n",
160 | " logits, labels = eval_preds\n",
161 | " predictions = np.argmax(logits, axis=-1)\n",
162 | " return metric.compute(predictions=predictions, references=labels)"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "training_args = TrainingArguments(\"test-trainer\", evaluation_strategy=\"epoch\")\n",
172 | "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
173 | "\n",
174 | "trainer = Trainer(\n",
175 | " model,\n",
176 | " training_args,\n",
177 | " train_dataset=tokenized_datasets[\"train\"],\n",
178 | " eval_dataset=tokenized_datasets[\"validation\"],\n",
179 | " data_collator=data_collator,\n",
180 | " tokenizer=tokenizer,\n",
181 | " compute_metrics=compute_metrics,\n",
182 | ")"
183 | ]
184 | }
185 | ],
186 | "metadata": {
187 | "colab": {
188 | "name": "Fine-tuning a model with the Trainer API",
189 | "provenance": []
190 | }
191 | },
192 | "nbformat": 4,
193 | "nbformat_minor": 4
194 | }
195 |
--------------------------------------------------------------------------------
/sagemaker/14_train_and_push_to_hub/scripts/train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import os
4 | import random
5 | import sys
6 |
7 | import numpy as np
8 | import torch
9 | from datasets import load_from_disk, load_metric
10 | from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
11 | from transformers.trainer_utils import get_last_checkpoint
12 |
13 | if __name__ == "__main__":
14 |
15 | parser = argparse.ArgumentParser()
16 |
17 | # hyperparameters sent by the client are passed as command-line arguments to the script.
18 | parser.add_argument("--epochs", type=int, default=3)
19 | parser.add_argument("--train_batch_size", type=int, default=32)
20 | parser.add_argument("--eval_batch_size", type=int, default=64)
21 | parser.add_argument("--warmup_steps", type=int, default=500)
22 | parser.add_argument("--model_id", type=str)
23 | parser.add_argument("--learning_rate", type=str, default=5e-5)
24 | parser.add_argument("--fp16", type=bool, default=True)
25 |
26 | # Push to Hub Parameters
27 | parser.add_argument("--push_to_hub", type=bool, default=True)
28 | parser.add_argument("--hub_model_id", type=str, default=None)
29 | parser.add_argument("--hub_strategy", type=str, default=None)
30 | parser.add_argument("--hub_token", type=str, default=None)
31 |
32 | # Data, model, and output directories
33 | parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
34 | parser.add_argument("--output_dir", type=str, default=os.environ["SM_MODEL_DIR"])
35 | parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
36 | parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
37 | parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
38 |
39 | args, _ = parser.parse_known_args()
40 |
41 | # make sure we have required parameters to push
42 | if args.push_to_hub:
43 | if args.hub_strategy is None:
44 | raise ValueError("--hub_strategy is required when pushing to Hub")
45 | if args.hub_token is None:
46 | raise ValueError("--hub_token is required when pushing to Hub")
47 |
48 | # sets hub id if not provided
49 | if args.hub_model_id is None:
50 | args.hub_model_id = args.model_id.replace("/", "--")
51 |
52 | # Set up logging
53 | logger = logging.getLogger(__name__)
54 |
55 | logging.basicConfig(
56 | level=logging.getLevelName("INFO"),
57 | handlers=[logging.StreamHandler(sys.stdout)],
58 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
59 | )
60 |
61 | # load datasets
62 | train_dataset = load_from_disk(args.training_dir)
63 | test_dataset = load_from_disk(args.test_dir)
64 |
65 | logger.info(f" loaded train_dataset length is: {len(train_dataset)}")
66 | logger.info(f" loaded test_dataset length is: {len(test_dataset)}")
67 |
68 | # define metrics and metrics function
69 | metric = load_metric("accuracy")
70 |
71 | def compute_metrics(eval_pred):
72 | predictions, labels = eval_pred
73 | predictions = np.argmax(predictions, axis=1)
74 | return metric.compute(predictions=predictions, references=labels)
75 |
76 | # Prepare model labels - useful in inference API
77 | labels = train_dataset.features["labels"].names
78 | num_labels = len(labels)
79 | label2id, id2label = dict(), dict()
80 | for i, label in enumerate(labels):
81 | label2id[label] = str(i)
82 | id2label[str(i)] = label
83 |
84 | # download model from model hub
85 | model = AutoModelForSequenceClassification.from_pretrained(
86 | args.model_id, num_labels=num_labels, label2id=label2id, id2label=id2label
87 | )
88 | tokenizer = AutoTokenizer.from_pretrained(args.model_id)
89 |
90 | # define training args
91 | training_args = TrainingArguments(
92 | output_dir=args.output_dir,
93 | overwrite_output_dir=True if get_last_checkpoint(args.output_dir) is not None else False,
94 | num_train_epochs=args.epochs,
95 | per_device_train_batch_size=args.train_batch_size,
96 | per_device_eval_batch_size=args.eval_batch_size,
97 | warmup_steps=args.warmup_steps,
98 | fp16=args.fp16,
99 | evaluation_strategy="epoch",
100 | save_strategy="epoch",
101 | save_total_limit=2,
102 | logging_dir=f"{args.output_data_dir}/logs",
103 | learning_rate=float(args.learning_rate),
104 | load_best_model_at_end=True,
105 | metric_for_best_model="accuracy",
106 | # push to hub parameters
107 | push_to_hub=args.push_to_hub,
108 | hub_strategy=args.hub_strategy,
109 | hub_model_id=args.hub_model_id,
110 | hub_token=args.hub_token,
111 | )
112 |
113 | # create Trainer instance
114 | trainer = Trainer(
115 | model=model,
116 | args=training_args,
117 | compute_metrics=compute_metrics,
118 | train_dataset=train_dataset,
119 | eval_dataset=test_dataset,
120 | tokenizer=tokenizer,
121 | )
122 |
123 | # train model
124 | trainer.train()
125 |
126 | # evaluate model
127 | eval_result = trainer.evaluate(eval_dataset=test_dataset)
128 |
129 | # save best model, metrics and create model card
130 | trainer.create_model_card(model_name=args.hub_model_id)
131 | trainer.push_to_hub()
132 |
133 | # Save the model to S3; use os.environ["SM_MODEL_DIR"] to make sure checkpointing works
134 | trainer.save_model(os.environ["SM_MODEL_DIR"])
135 |
--------------------------------------------------------------------------------
/course/videos/token_processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_dataset\n",
61 | "\n",
62 | "raw_datasets = load_dataset(\"conll2003\")\n",
63 | "raw_datasets = raw_datasets.remove_columns([\"chunk_tags\", \"id\", \"pos_tags\"])\n",
64 | "raw_datasets = raw_datasets.rename_column(\"ner_tags\", \"labels\")\n",
65 | "raw_datasets = raw_datasets.rename_column(\"tokens\", \"words\")\n",
66 | "raw_datasets[\"train\"]"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "print(raw_datasets[\"train\"][0][\"words\"])\n",
76 | "print(raw_datasets[\"train\"][0][\"labels\"])"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "label_names = raw_datasets[\"train\"].features[\"labels\"].feature.names\n",
86 | "label_names"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "from transformers import AutoTokenizer\n",
96 | "\n",
97 | "model_checkpoint = \"bert-base-cased\"\n",
98 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
99 | "\n",
100 | "inputs = tokenizer(raw_datasets[\"train\"][0][\"words\"], is_split_into_words=True)\n",
101 | "inputs.tokens()"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "def shift_label(label):\n",
111 | " # If the label is B-XXX we change it to I-XXX\n",
112 | " if label % 2 == 1:\n",
113 | " label += 1\n",
114 | " return label\n",
115 | "\n",
116 | "def align_labels_with_tokens(labels, word_ids):\n",
117 | " new_labels = []\n",
118 | " current_word = None\n",
119 | " for word_id in word_ids:\n",
120 | " if word_id is None:\n",
121 | " new_labels.append(-100)\n",
122 | " elif word_id != current_word:\n",
123 | " # Start of a new word!\n",
124 | " current_word = word_id\n",
125 | " new_labels.append(labels[word_id])\n",
126 | " else:\n",
127 | " new_labels.append(shift_label(labels[word_id]))\n",
128 | "\n",
129 | " return new_labels"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "def tokenize_and_align_labels(examples):\n",
139 | " tokenized_inputs = tokenizer(examples[\"words\"], truncation=True, is_split_into_words=True)\n",
140 | " new_labels = []\n",
141 | " for i, labels in enumerate(examples[\"labels\"]):\n",
142 | " word_ids = tokenized_inputs.word_ids(i)\n",
143 | " new_labels.append(align_labels_with_tokens(labels, word_ids))\n",
144 | "\n",
145 | " tokenized_inputs[\"labels\"] = new_labels\n",
146 | " return tokenized_inputs\n",
147 | "\n",
148 | "tokenized_datasets = raw_datasets.map(tokenize_and_align_labels, batched=True)"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "from transformers import DataCollatorForTokenClassification\n",
158 | "\n",
159 | "data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": null,
165 | "metadata": {},
166 | "outputs": [],
167 | "source": []
168 | }
169 | ],
170 | "metadata": {
171 | "colab": {
172 | "name": "Data processing for Token Classification",
173 | "provenance": []
174 | }
175 | },
176 | "nbformat": 4,
177 | "nbformat_minor": 4
178 | }
179 |
--------------------------------------------------------------------------------
/course/videos/translation_processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from datasets import load_dataset, load_metric\n",
61 | "\n",
62 | "raw_datasets = load_dataset(\"kde4\", lang1=\"en\", lang2=\"fr\")\n",
63 | "\n",
64 | "def extract_languages(examples):\n",
65 | " inputs = [ex[\"en\"] for ex in examples[\"translation\"]]\n",
66 | " targets = [ex[\"fr\"] for ex in examples[\"translation\"]]\n",
67 | " return {\"inputs\": inputs, \"targets\": targets}\n",
68 | "\n",
69 | "raw_datasets = raw_datasets.map(extract_languages, batched=True, remove_columns=[\"id\", \"translation\"])\n",
70 | "raw_datasets"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "print(raw_datasets[\"train\"][10])\n",
80 | "print(raw_datasets[\"train\"][11])\n",
81 | "print(raw_datasets[\"train\"][12])"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "from transformers import AutoTokenizer\n",
91 | "\n",
92 | "model_checkpoint = \"Helsinki-NLP/opus-mt-en-fr\"\n",
93 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
94 | "\n",
95 | "sample = raw_datasets[\"train\"][12]\n",
96 | "inputs = tokenizer(sample[\"inputs\"])\n",
97 | "targets = tokenizer(sample[\"targets\"])\n",
98 | "\n",
99 | "\n",
100 | "print(tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"]))\n",
101 | "print(tokenizer.convert_ids_to_tokens(targets[\"input_ids\"]))"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "from transformers import AutoTokenizer\n",
111 | "\n",
112 | "model_checkpoint = \"Helsinki-NLP/opus-mt-en-fr\"\n",
113 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
114 | "\n",
115 | "sample = raw_datasets[\"train\"][12]\n",
116 | "inputs = tokenizer(sample[\"inputs\"])\n",
117 | "with tokenizer.as_target_tokenizer():\n",
118 | " targets = tokenizer(sample[\"targets\"])\n",
119 | "\n",
120 | "\n",
121 | "print(tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"]))\n",
122 | "print(tokenizer.convert_ids_to_tokens(targets[\"input_ids\"]))"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "max_input_length = 128\n",
132 | "max_target_length = 128\n",
133 | "\n",
134 | "def preprocess_function(examples):\n",
135 | " model_inputs = tokenizer(examples[\"inputs\"], max_length=max_input_length, truncation=True)\n",
136 | "\n",
137 | " # Setup the tokenizer for targets\n",
138 | " with tokenizer.as_target_tokenizer():\n",
139 | " labels = tokenizer(examples[\"targets\"], max_length=max_target_length, truncation=True)\n",
140 | "\n",
141 | " model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
142 | " return model_inputs\n",
143 | "\n",
144 | "tokenized_datasets = raw_datasets.map(\n",
145 | " preprocess_function, batched=True, remove_columns=[\"inputs\", \"targets\"]\n",
146 | ")"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "from transformers import DataCollatorForSeq2Seq\n",
156 | "\n",
157 | "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": []
166 | }
167 | ],
168 | "metadata": {
169 | "colab": {
170 | "name": "Data processing for Translation",
171 | "provenance": []
172 | }
173 | },
174 | "nbformat": 4,
175 | "nbformat_minor": 4
176 | }
177 |
--------------------------------------------------------------------------------
/sagemaker/09_image_classification_vision_transformer/scripts/train.py:
--------------------------------------------------------------------------------
1 | from transformers import ViTForImageClassification, Trainer, TrainingArguments, default_data_collator, ViTFeatureExtractor
2 | from datasets import load_from_disk, load_metric
3 | import random
4 | import logging
5 | import sys
6 | import argparse
7 | import os
8 | import numpy as np
9 | import subprocess
10 |
11 | subprocess.run([
12 | "git",
13 | "config",
14 | "--global",
15 | "user.email",
16 | "sagemaker@huggingface.co",
17 | ], check=True)
18 | subprocess.run([
19 | "git",
20 | "config",
21 | "--global",
22 | "user.name",
23 | "sagemaker",
24 | ], check=True)
25 |
26 |
27 | if __name__ == "__main__":
28 |
29 | parser = argparse.ArgumentParser()
30 |
31 | # hyperparameters sent by the client are passed as command-line arguments to the script.
32 | parser.add_argument("--model_name", type=str)
33 | parser.add_argument("--output_dir", type=str,default="/opt/ml/model")
34 | parser.add_argument("--extra_model_name", type=str,default="sagemaker")
35 | parser.add_argument("--dataset", type=str,default="cifar10")
36 | parser.add_argument("--task", type=str,default="image-classification")
37 | parser.add_argument("--use_auth_token", type=str, default="")
38 |
39 | parser.add_argument("--num_train_epochs", type=int, default=3)
40 | parser.add_argument("--per_device_train_batch_size", type=int, default=32)
41 | parser.add_argument("--per_device_eval_batch_size", type=int, default=64)
42 | parser.add_argument("--warmup_steps", type=int, default=500)
43 | parser.add_argument("--weight_decay", type=float, default=0.01)
44 | parser.add_argument("--learning_rate", type=str, default=2e-5)
45 |
46 | parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
47 | parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
48 |
49 | args, _ = parser.parse_known_args()
50 |
51 | # Set up logging
52 | logger = logging.getLogger(__name__)
53 |
54 | logging.basicConfig(
55 | level=logging.getLevelName("INFO"),
56 | handlers=[logging.StreamHandler(sys.stdout)],
57 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
58 | )
59 |
60 | # load datasets
61 | train_dataset = load_from_disk(args.training_dir)
62 | test_dataset = load_from_disk(args.test_dir)
63 | num_classes = train_dataset.features["label"].num_classes
64 |
65 |
66 | logger.info(f" loaded train_dataset length is: {len(train_dataset)}")
67 | logger.info(f" loaded test_dataset length is: {len(test_dataset)}")
68 |
69 | metric_name = "accuracy"
70 | # compute metrics function for multi-class image classification
71 |
72 | metric = load_metric(metric_name)
73 |
74 | def compute_metrics(eval_pred):
75 | predictions, labels = eval_pred
76 | predictions = np.argmax(predictions, axis=1)
77 | return metric.compute(predictions=predictions, references=labels)
78 |
79 | # download model from model hub
80 | model = ViTForImageClassification.from_pretrained(args.model_name, num_labels=num_classes)
81 |
82 | # remap the model config's generic labels to the dataset's class names
83 | id2label = {key: train_dataset.features["label"].names[index] for index, key in enumerate(model.config.id2label.keys())}
84 | label2id = {train_dataset.features["label"].names[index]: value for index, value in enumerate(model.config.label2id.values())}
85 | model.config.id2label = id2label
86 | model.config.label2id = label2id
87 |
88 |
89 | # define training args
90 | training_args = TrainingArguments(
91 | output_dir=args.output_dir,
92 | num_train_epochs=args.num_train_epochs,
93 | per_device_train_batch_size=args.per_device_train_batch_size,
94 | per_device_eval_batch_size=args.per_device_eval_batch_size,
95 | warmup_steps=args.warmup_steps,
96 | weight_decay=args.weight_decay,
97 | evaluation_strategy="epoch",
98 | logging_dir=f"{args.output_dir}/logs",
99 | learning_rate=float(args.learning_rate),
100 | load_best_model_at_end=True,
101 | metric_for_best_model=metric_name,
102 | )
103 |
104 |
105 | # create Trainer instance
106 | trainer = Trainer(
107 | model=model,
108 | args=training_args,
109 | compute_metrics=compute_metrics,
110 | train_dataset=train_dataset,
111 | eval_dataset=test_dataset,
112 | data_collator=default_data_collator,
113 | )
114 |
115 | # train model
116 | trainer.train()
117 |
118 | # evaluate model
119 | eval_result = trainer.evaluate(eval_dataset=test_dataset)
120 |
121 | # writes eval results to a file that can be accessed later in the S3 output
122 | with open(os.path.join(args.output_dir, "eval_results.txt"), "w") as writer:
123 | print("***** Eval results *****")
124 | for key, value in sorted(eval_result.items()):
125 | writer.write(f"{key} = {value}\n")
126 |
127 | # Saves the model to s3
128 | trainer.save_model(args.output_dir)
129 |
130 | if args.use_auth_token != "":
131 | kwargs = {
132 | "finetuned_from": args.model_name.split("/")[1],
133 | "tags": "image-classification",
134 | "dataset": args.dataset,
135 | }
136 | repo_name = (
137 | f"{args.model_name.split('/')[1]}-{args.task}"
138 | if args.extra_model_name == ""
139 | else f"{args.model_name.split('/')[1]}-{args.task}-{args.extra_model_name}"
140 | )
141 |
142 | trainer.push_to_hub(
143 | repo_name=repo_name,
144 | use_auth_token=args.use_auth_token,
145 | **kwargs,
146 | )
147 |
--------------------------------------------------------------------------------
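The training script above takes its hyperparameters as command-line arguments and reads its data from the `SM_CHANNEL_TRAIN` / `SM_CHANNEL_TEST` input channels. As a rough sketch (not taken from this repository), a script of this shape is typically submitted through the SageMaker Hugging Face estimator along these lines; the checkpoint name, role ARN, S3 URIs, instance type and framework versions are placeholders.

```python
from sagemaker.huggingface import HuggingFace

# Hyperparameters end up as command-line arguments of train.py.
hyperparameters = {
    "model_name": "google/vit-base-patch16-224-in21k",  # placeholder checkpoint
    "num_train_epochs": 3,
    "per_device_train_batch_size": 32,
    "learning_rate": 2e-5,
}

huggingface_estimator = HuggingFace(
    entry_point="train.py",
    source_dir="./scripts",
    instance_type="ml.p3.2xlarge",          # placeholder instance type
    instance_count=1,
    role="<sagemaker-execution-role-arn>",   # placeholder IAM role
    transformers_version="4.6",              # placeholder framework versions
    pytorch_version="1.7",
    py_version="py36",
    hyperparameters=hyperparameters,
)

# The channel names map to SM_CHANNEL_TRAIN / SM_CHANNEL_TEST inside the job,
# which the script picks up through its --training_dir / --test_dir defaults.
huggingface_estimator.fit({"train": "s3://<bucket>/train", "test": "s3://<bucket>/test"})
```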
/course/videos/sentence_pairs_tf.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from transformers import AutoTokenizer\n",
61 | "\n",
62 | "checkpoint = \"bert-base-uncased\"\n",
63 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
64 | "sequences = [\n",
65 | " \"I've been waiting for a HuggingFace course my whole life.\",\n",
66 | " \"This course is amazing!\",\n",
67 | "]\n",
68 | "batch = tokenizer(sequences, padding=True, truncation=True, return_tensors=\"tf\")"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "metadata": {},
75 | "outputs": [
76 | {
77 | "data": {
78 | "text/plain": [
79 | "{'input_ids': [101, 2026, 2171, 2003, 25353, 22144, 2378, 1012, 102, 1045, 2147, 2012, 17662, 2227, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
80 | ]
81 | },
82 | "execution_count": null,
83 | "metadata": {},
84 | "output_type": "execute_result"
85 | }
86 | ],
87 | "source": [
88 | "from transformers import AutoTokenizer\n",
89 | "\n",
90 | "checkpoint = \"bert-base-uncased\"\n",
91 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
92 | "tokenizer(\"My name is Sylvain.\", \"I work at Hugging Face.\")"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {},
99 | "outputs": [
100 | {
101 | "data": {
102 | "text/plain": [
103 | "{'input_ids': [[101, 2026, 2171, 2003, 25353, 22144, 2378, 1012, 102, 1045, 2147, 2012, 17662, 2227, 1012, 102], [101, 2183, 2000, 1996, 5988, 1012, 102, 2023, 3185, 2003, 2307, 1012, 102, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]}"
104 | ]
105 | },
106 | "execution_count": null,
107 | "metadata": {},
108 | "output_type": "execute_result"
109 | }
110 | ],
111 | "source": [
112 | "from transformers import AutoTokenizer\n",
113 | "\n",
114 | "checkpoint = \"bert-base-uncased\"\n",
115 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
116 | "tokenizer(\n",
117 | " [\"My name is Sylvain.\", \"Going to the cinema.\"],\n",
118 | " [\"I work at Hugging Face.\", \"This movie is great.\"],\n",
119 | " padding=True\n",
120 | ")"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "metadata": {},
127 | "outputs": [
128 | {
129 | "name": "stderr",
130 | "output_type": "stream",
131 | "text": [
132 | "All model checkpoint layers were used when initializing TFBertForSequenceClassification.\n",
133 | "\n",
134 | "Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']\n",
135 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
136 | ]
137 | }
138 | ],
139 | "source": [
140 | "from transformers import TFAutoModelForSequenceClassification, AutoTokenizer\n",
141 | "\n",
142 | "checkpoint = \"bert-base-uncased\"\n",
143 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
144 | "batch = tokenizer(\n",
145 | " [\"My name is Sylvain.\", \"Going to the cinema.\"],\n",
146 | " [\"I work at Hugging Face.\", \"This movie is great.\"],\n",
147 | " padding=True,\n",
148 | " return_tensors=\"tf\",\n",
149 | ")\n",
150 | "\n",
151 | "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
152 | "outputs = model(**batch)"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {},
159 | "outputs": [],
160 | "source": []
161 | }
162 | ],
163 | "metadata": {
164 | "colab": {
165 | "name": "Preprocessing sentence pairs (TensorFlow)",
166 | "provenance": []
167 | }
168 | },
169 | "nbformat": 4,
170 | "nbformat_minor": 4
171 | }
172 |
--------------------------------------------------------------------------------
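The notebook above prints the `token_type_ids` a BERT tokenizer produces for a sentence pair. A small sketch, assuming the same `bert-base-uncased` checkpoint, makes the segment layout explicit: the ids are 0 for `[CLS]` plus the first sentence and its `[SEP]`, and 1 for the second sentence and the final `[SEP]`.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
encoding = tokenizer("My name is Sylvain.", "I work at Hugging Face.")

# Print each token next to its segment id to visualize the pair layout.
for token, type_id in zip(
    tokenizer.convert_ids_to_tokens(encoding["input_ids"]), encoding["token_type_ids"]
):
    print(f"{token:>12}  segment {type_id}")
```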
/course/videos/tensorflow_finetuning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {
65 | "cellView": "form"
66 | },
67 | "outputs": [
68 | {
69 | "data": {
70 | "text/html": [
71 | ""
72 | ],
73 | "text/plain": [
74 | ""
75 | ]
76 | },
77 | "execution_count": null,
78 | "metadata": {},
79 | "output_type": "execute_result"
80 | }
81 | ],
82 | "source": [
83 | "#@title\n",
84 | "from IPython.display import HTML\n",
85 | "\n",
86 | "HTML('')"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "Install the Transformers and Datasets libraries to run this notebook."
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "! pip install datasets transformers[sentencepiece]"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "from datasets import load_dataset\n",
112 | "from transformers import AutoTokenizer\n",
113 | "import numpy as np\n",
114 | "\n",
115 | "raw_datasets = load_dataset(\"glue\", \"mrpc\")\n",
116 | "checkpoint = \"bert-base-uncased\"\n",
117 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
118 | "\n",
119 | "def tokenize_dataset(dataset):\n",
120 | " encoded = tokenizer(\n",
121 | " dataset[\"sentence1\"],\n",
122 | " dataset[\"sentence2\"],\n",
123 | " max_length=128,\n",
124 | " truncation=True,\n",
125 | " )\n",
126 | " return encoded.data\n",
127 | "\n",
128 | "tokenized_datasets = raw_datasets.map(tokenize_dataset, batched=True)\n",
129 | "\n",
130 | "train_dataset = tokenized_datasets[\"train\"].to_tf_dataset(\n",
131 | " columns=[\"input_ids\", \"attention_mask\", \"token_type_ids\"],\n",
132 | " label_cols=[\"label\"],\n",
133 | " shuffle=True,\n",
134 | " batch_size=8)\n",
135 | "\n",
136 | "validation_dataset = tokenized_datasets[\"validation\"].to_tf_dataset(\n",
137 | " columns=[\"input_ids\", \"attention_mask\", \"token_type_ids\"],\n",
138 | " label_cols=[\"label\"],\n",
139 | " shuffle=True,\n",
140 | " batch_size=8)"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "next(iter(train_dataset))[1]"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {},
156 | "outputs": [],
157 | "source": [
158 | "import tensorflow as tf\n",
159 | "from transformers import TFAutoModelForSequenceClassification\n",
160 | "\n",
161 | "checkpoint = 'bert-base-cased'\n",
162 | "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
163 | "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n",
164 | "model.compile(optimizer='adam', loss=loss)"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": null,
170 | "metadata": {},
171 | "outputs": [],
172 | "source": [
173 | "model.fit(\n",
174 | " train_dataset,\n",
175 | " validation_data=validation_dataset,\n",
176 | " epochs=3\n",
177 | ")"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": []
186 | }
187 | ],
188 | "metadata": {
189 | "colab": {
190 | "name": "Fine-Tuning with TensorFlow",
191 | "provenance": []
192 | }
193 | },
194 | "nbformat": 4,
195 | "nbformat_minor": 4
196 | }
197 |
--------------------------------------------------------------------------------
/course/chapter3/section3_tf.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Fine-tuning a model with Keras"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "from datasets import load_dataset\n",
33 | "from transformers import AutoTokenizer, DataCollatorWithPadding\n",
34 | "import numpy as np\n",
35 | "\n",
36 | "raw_datasets = load_dataset(\"glue\", \"mrpc\")\n",
37 | "checkpoint = \"bert-base-uncased\"\n",
38 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
39 | "\n",
40 | "\n",
41 | "def tokenize_function(example):\n",
42 | " return tokenizer(example[\"sentence1\"], example[\"sentence2\"], truncation=True)\n",
43 | "\n",
44 | "\n",
45 | "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n",
46 | "\n",
47 | "data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors=\"tf\")\n",
48 | "\n",
49 | "tf_train_dataset = tokenized_datasets[\"train\"].to_tf_dataset(\n",
50 | " columns=[\"attention_mask\", \"input_ids\", \"token_type_ids\"],\n",
51 | " label_cols=[\"labels\"],\n",
52 | " shuffle=True,\n",
53 | " collate_fn=data_collator,\n",
54 | " batch_size=8,\n",
55 | ")\n",
56 | "\n",
57 | "tf_validation_dataset = tokenized_datasets[\"validation\"].to_tf_dataset(\n",
58 | " columns=[\"attention_mask\", \"input_ids\", \"token_type_ids\"],\n",
59 | " label_cols=[\"labels\"],\n",
60 | " shuffle=False,\n",
61 | " collate_fn=data_collator,\n",
62 | " batch_size=8,\n",
63 | ")"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "from transformers import TFAutoModelForSequenceClassification\n",
73 | "\n",
74 | "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "from tensorflow.keras.losses import SparseCategoricalCrossentropy\n",
84 | "\n",
85 | "model.compile(\n",
86 | " optimizer=\"adam\",\n",
87 | " loss=SparseCategoricalCrossentropy(from_logits=True),\n",
88 | " metrics=[\"accuracy\"],\n",
89 | ")\n",
90 | "model.fit(\n",
91 | " tf_train_dataset,\n",
92 | " validation_data=tf_validation_dataset,\n",
93 | ")"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "from tensorflow.keras.optimizers.schedules import PolynomialDecay\n",
103 | "\n",
104 | "batch_size = 8\n",
105 | "num_epochs = 3\n",
106 | "# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied\n",
107 | "# by the total number of epochs\n",
108 | "num_train_steps = len(tf_train_dataset) * num_epochs\n",
109 | "lr_scheduler = PolynomialDecay(\n",
110 | " initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps\n",
111 | ")\n",
112 | "from tensorflow.keras.optimizers import Adam\n",
113 | "\n",
114 | "opt = Adam(learning_rate=lr_scheduler)"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {},
121 | "outputs": [],
122 | "source": [
123 | "import tensorflow as tf\n",
124 | "\n",
125 | "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
126 | "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n",
127 | "model.compile(optimizer=opt, loss=loss, metrics=[\"accuracy\"])"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": null,
133 | "metadata": {},
134 | "outputs": [],
135 | "source": [
136 | "model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3)"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "preds = model.predict(tf_validation_dataset)[\"logits\"]"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [
153 | {
154 | "data": {
155 | "text/plain": [
156 | "(408, 2) (408,)"
157 | ]
158 | },
159 | "execution_count": null,
160 | "metadata": {},
161 | "output_type": "execute_result"
162 | }
163 | ],
164 | "source": [
165 | "class_preds = np.argmax(preds, axis=1)\n",
166 | "print(preds.shape, class_preds.shape)"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {},
173 | "outputs": [
174 | {
175 | "data": {
176 | "text/plain": [
177 | "{'accuracy': 0.8578431372549019, 'f1': 0.8996539792387542}"
178 | ]
179 | },
180 | "execution_count": null,
181 | "metadata": {},
182 | "output_type": "execute_result"
183 | }
184 | ],
185 | "source": [
186 | "from datasets import load_metric\n",
187 | "\n",
188 | "metric = load_metric(\"glue\", \"mrpc\")\n",
189 | "metric.compute(predictions=class_preds, references=raw_datasets[\"validation\"][\"label\"])"
190 | ]
191 | }
192 | ],
193 | "metadata": {
194 | "colab": {
195 | "name": "Fine-tuning a model with Keras",
196 | "provenance": []
197 | }
198 | },
199 | "nbformat": 4,
200 | "nbformat_minor": 4
201 | }
202 |
--------------------------------------------------------------------------------
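The notebook above computes `num_train_steps` as `len(tf_train_dataset) * num_epochs`, where `len(tf_train_dataset)` is already the number of batches per epoch. A quick arithmetic sketch under the notebook's own settings (MRPC's 3,668 training pairs, batch size 8, 3 epochs), also showing that the `PolynomialDecay` schedule is a callable that can be inspected step by step:

```python
import tensorflow as tf

# 3,668 examples / batch_size 8 -> 459 batches per epoch; 3 epochs -> 1,377 decay steps.
num_train_steps = 459 * 3

lr_scheduler = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)

# The default power=1.0 makes this a linear decay from 5e-5 down to 0.
for step in (0, num_train_steps // 2, num_train_steps):
    print(step, float(lr_scheduler(step)))
```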
/course/videos/semantic_search.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "import torch\n",
61 | "from transformers import AutoTokenizer, AutoModel\n",
62 | "\n",
63 | "sentences = [\n",
64 | " \"I took my dog for a walk\",\n",
65 | " \"Today is going to rain\",\n",
66 | " \"I took my cat for a walk\",\n",
67 | "]\n",
68 | "\n",
69 | "model_ckpt = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
70 | "tokenizer = AutoTokenizer.from_pretrained(model_ckpt)\n",
71 | "model = AutoModel.from_pretrained(model_ckpt)\n",
72 | "\n",
73 | "encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors=\"pt\")\n",
74 | "\n",
75 | "with torch.no_grad():\n",
76 | " model_output = model(**encoded_input)\n",
77 | " \n",
78 | " \n",
79 | "token_embeddings = model_output.last_hidden_state\n",
80 | "print(f\"Token embeddings shape: {token_embeddings.size()}\")"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "import torch.nn.functional as F\n",
90 | "\n",
91 | "\n",
92 | "def mean_pooling(model_output, attention_mask):\n",
93 | " token_embeddings = model_output.last_hidden_state\n",
94 | " input_mask_expanded = (\n",
95 | " attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()\n",
96 | " )\n",
97 | " return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(\n",
98 | " input_mask_expanded.sum(1), min=1e-9\n",
99 | " )\n",
100 | "\n",
101 | "\n",
102 | "sentence_embeddings = mean_pooling(model_output, encoded_input[\"attention_mask\"])\n",
103 | "# Normalize the embeddings\n",
104 | "sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)\n",
105 | "print(f\"Sentence embeddings shape: {sentence_embeddings.size()}\")"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "import numpy as np\n",
115 | "from sklearn.metrics.pairwise import cosine_similarity\n",
116 | "\n",
117 | "sentence_embeddings = sentence_embeddings.detach().numpy()\n",
118 | "\n",
119 | "scores = np.zeros((sentence_embeddings.shape[0], sentence_embeddings.shape[0]))\n",
120 | "\n",
121 | "for idx in range(sentence_embeddings.shape[0]):\n",
122 | " scores[idx, :] = cosine_similarity([sentence_embeddings[idx]], sentence_embeddings)[0]"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "from datasets import load_dataset\n",
132 | "\n",
133 | "squad = load_dataset(\"squad\", split=\"validation\").shuffle(seed=42).select(range(100))\n",
134 | "\n",
135 | "\n",
136 | "def get_embeddings(text_list):\n",
137 | " encoded_input = tokenizer(\n",
138 | " text_list, padding=True, truncation=True, return_tensors=\"pt\"\n",
139 | " )\n",
140 | " encoded_input = {k: v for k, v in encoded_input.items()}\n",
141 | " with torch.no_grad():\n",
142 | " model_output = model(**encoded_input)\n",
143 | " return mean_pooling(model_output, encoded_input[\"attention_mask\"])\n",
144 | "\n",
145 | "\n",
146 | "squad_with_embeddings = squad.map(\n",
147 | " lambda x: {\"embeddings\": get_embeddings(x[\"context\"]).cpu().numpy()[0]}\n",
148 | ")"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "squad_with_embeddings.add_faiss_index(column=\"embeddings\")\n",
158 | "\n",
159 | "question = \"Who headlined the halftime show for Super Bowl 50?\"\n",
160 | "question_embedding = get_embeddings([question]).cpu().detach().numpy()\n",
161 | "\n",
162 | "scores, samples = squad_with_embeddings.get_nearest_examples(\n",
163 | " \"embeddings\", question_embedding, k=3\n",
164 | ")"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": null,
170 | "metadata": {},
171 | "outputs": [],
172 | "source": []
173 | }
174 | ],
175 | "metadata": {
176 | "colab": {
177 | "name": "Text embeddings & semantic search",
178 | "provenance": []
179 | }
180 | },
181 | "nbformat": 4,
182 | "nbformat_minor": 4
183 | }
184 |
--------------------------------------------------------------------------------
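The last code cell of the notebook above retrieves the nearest examples but leaves them unprinted. A minimal sketch, assuming the `scores` and `samples` variables it produces: `samples` is a dict with one list per dataset column and `scores` holds the FAISS distances, so the two can simply be zipped.

```python
# Display the retrieved contexts alongside their FAISS scores.
for score, title, context in zip(scores, samples["title"], samples["context"]):
    print(f"score: {score:.3f} | {title}")
    print(context[:200])
    print()
```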
/course/chapter2/section6_pt.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Putting it all together (PyTorch)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "from transformers import AutoTokenizer\n",
33 | "\n",
34 | "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
35 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
36 | "\n",
37 | "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n",
38 | "\n",
39 | "model_inputs = tokenizer(sequence)"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n",
49 | "\n",
50 | "model_inputs = tokenizer(sequence)"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
60 | "\n",
61 | "model_inputs = tokenizer(sequences)"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "# Will pad the sequences up to the maximum sequence length\n",
71 | "model_inputs = tokenizer(sequences, padding=\"longest\")\n",
72 | "\n",
73 | "# Will pad the sequences up to the model max length\n",
74 | "# (512 for BERT or DistilBERT)\n",
75 | "model_inputs = tokenizer(sequences, padding=\"max_length\")\n",
76 | "\n",
77 | "# Will pad the sequences up to the specified max length\n",
78 | "model_inputs = tokenizer(sequences, padding=\"max_length\", max_length=8)"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
88 | "\n",
89 | "# Will truncate the sequences that are longer than the model max length\n",
90 | "# (512 for BERT or DistilBERT)\n",
91 | "model_inputs = tokenizer(sequences, truncation=True)\n",
92 | "\n",
93 | "# Will truncate the sequences that are longer than the specified max length\n",
94 | "model_inputs = tokenizer(sequences, max_length=8, truncation=True)"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
104 | "\n",
105 | "# Returns PyTorch tensors\n",
106 | "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"pt\")\n",
107 | "\n",
108 | "# Returns TensorFlow tensors\n",
109 | "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"tf\")\n",
110 | "\n",
111 | "# Returns NumPy arrays\n",
112 | "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"np\")"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {},
119 | "outputs": [
120 | {
121 | "data": {
122 | "text/plain": [
123 | "[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]\n",
124 | "[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]"
125 | ]
126 | },
127 | "execution_count": null,
128 | "metadata": {},
129 | "output_type": "execute_result"
130 | }
131 | ],
132 | "source": [
133 | "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n",
134 | "\n",
135 | "model_inputs = tokenizer(sequence)\n",
136 | "print(model_inputs[\"input_ids\"])\n",
137 | "\n",
138 | "tokens = tokenizer.tokenize(sequence)\n",
139 | "ids = tokenizer.convert_tokens_to_ids(tokens)\n",
140 | "print(ids)"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {},
147 | "outputs": [
148 | {
149 | "data": {
150 | "text/plain": [
151 | "\"[CLS] i've been waiting for a huggingface course my whole life. [SEP]\"\n",
152 | "\"i've been waiting for a huggingface course my whole life.\""
153 | ]
154 | },
155 | "execution_count": null,
156 | "metadata": {},
157 | "output_type": "execute_result"
158 | }
159 | ],
160 | "source": [
161 | "print(tokenizer.decode(model_inputs[\"input_ids\"]))\n",
162 | "print(tokenizer.decode(ids))"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "import torch\n",
172 | "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n",
173 | "\n",
174 | "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
175 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
176 | "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
177 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
178 | "\n",
179 | "tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors=\"pt\")\n",
180 | "output = model(**tokens)"
181 | ]
182 | }
183 | ],
184 | "metadata": {
185 | "colab": {
186 | "name": "Putting it all together (PyTorch)",
187 | "provenance": []
188 | }
189 | },
190 | "nbformat": 4,
191 | "nbformat_minor": 4
192 | }
193 |
--------------------------------------------------------------------------------
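The final cell above runs the sequence-classification model on the tokenized batch but does not interpret the result. A short sketch, assuming the `output`, `sequences` and `model` variables from that cell: the SST-2 checkpoint's config carries an `id2label` mapping (NEGATIVE / POSITIVE), so the logits can be softmaxed and mapped back to label names.

```python
import torch

# Convert logits to probabilities and look up the predicted label name.
probabilities = torch.nn.functional.softmax(output.logits, dim=-1)
for sequence, probs in zip(sequences, probabilities):
    label_id = int(probs.argmax())
    print(f"{sequence!r} -> {model.config.id2label[label_id]} ({float(probs[label_id]):.3f})")
```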
/course/chapter2/section6_tf.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Putting it all together (TensorFlow)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install datasets transformers[sentencepiece]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "from transformers import AutoTokenizer\n",
33 | "\n",
34 | "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
35 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
36 | "\n",
37 | "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n",
38 | "\n",
39 | "model_inputs = tokenizer(sequence)"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n",
49 | "\n",
50 | "model_inputs = tokenizer(sequence)"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
60 | "\n",
61 | "model_inputs = tokenizer(sequences)"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "# Will pad the sequences up to the maximum sequence length\n",
71 | "model_inputs = tokenizer(sequences, padding=\"longest\")\n",
72 | "\n",
73 | "# Will pad the sequences up to the model max length\n",
74 | "# (512 for BERT or DistilBERT)\n",
75 | "model_inputs = tokenizer(sequences, padding=\"max_length\")\n",
76 | "\n",
77 | "# Will pad the sequences up to the specified max length\n",
78 | "model_inputs = tokenizer(sequences, padding=\"max_length\", max_length=8)"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
88 | "\n",
89 | "# Will truncate the sequences that are longer than the model max length\n",
90 | "# (512 for BERT or DistilBERT)\n",
91 | "model_inputs = tokenizer(sequences, truncation=True)\n",
92 | "\n",
93 | "# Will truncate the sequences that are longer than the specified max length\n",
94 | "model_inputs = tokenizer(sequences, max_length=8, truncation=True)"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
104 | "\n",
105 | "# Returns PyTorch tensors\n",
106 | "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"pt\")\n",
107 | "\n",
108 | "# Returns TensorFlow tensors\n",
109 | "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"tf\")\n",
110 | "\n",
111 | "# Returns NumPy arrays\n",
112 | "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"np\")"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {},
119 | "outputs": [
120 | {
121 | "data": {
122 | "text/plain": [
123 | "[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]\n",
124 | "[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]"
125 | ]
126 | },
127 | "execution_count": null,
128 | "metadata": {},
129 | "output_type": "execute_result"
130 | }
131 | ],
132 | "source": [
133 | "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n",
134 | "\n",
135 | "model_inputs = tokenizer(sequence)\n",
136 | "print(model_inputs[\"input_ids\"])\n",
137 | "\n",
138 | "tokens = tokenizer.tokenize(sequence)\n",
139 | "ids = tokenizer.convert_tokens_to_ids(tokens)\n",
140 | "print(ids)"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {},
147 | "outputs": [
148 | {
149 | "data": {
150 | "text/plain": [
151 | "\"[CLS] i've been waiting for a huggingface course my whole life. [SEP]\"\n",
152 | "\"i've been waiting for a huggingface course my whole life.\""
153 | ]
154 | },
155 | "execution_count": null,
156 | "metadata": {},
157 | "output_type": "execute_result"
158 | }
159 | ],
160 | "source": [
161 | "print(tokenizer.decode(model_inputs[\"input_ids\"]))\n",
162 | "print(tokenizer.decode(ids))"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "import tensorflow as tf\n",
172 | "from transformers import AutoTokenizer, TFAutoModelForSequenceClassification\n",
173 | "\n",
174 | "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
175 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
176 | "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
177 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
178 | "\n",
179 | "tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors=\"tf\")\n",
180 | "output = model(**tokens)"
181 | ]
182 | }
183 | ],
184 | "metadata": {
185 | "colab": {
186 | "name": "Putting it all together (TensorFlow)",
187 | "provenance": []
188 | }
189 | },
190 | "nbformat": 4,
191 | "nbformat_minor": 4
192 | }
193 |
--------------------------------------------------------------------------------
/course/videos/token_pipeline_pt.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "cellView": "form"
15 | },
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | ""
21 | ],
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": null,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "#@title\n",
33 | "from IPython.display import HTML\n",
34 | "\n",
35 | "HTML('')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Install the Transformers and Datasets libraries to run this notebook."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "! pip install datasets transformers[sentencepiece]"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from transformers import pipeline\n",
61 | "\n",
62 | "token_classifier = pipeline(\"token-classification\")\n",
63 | "token_classifier(\"My name is Sylvain and I work at Hugging Face in Brooklyn.\")"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "token_classifier = pipeline(\"token-classification\", aggregation_strategy=\"simple\")\n",
73 | "token_classifier(\"My name is Sylvain and I work at Hugging Face in Brooklyn.\")"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "from transformers import AutoTokenizer, AutoModelForTokenClassification\n",
83 | "\n",
84 | "model_checkpoint = \"\"\n",
85 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
86 | "model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)\n",
87 | "\n",
88 | "example = \"My name is Sylvain and I work at Hugging Face in Brooklyn.\"\n",
89 | "inputs = tokenizer(example, return_tensors=\"pt\")\n",
90 | "outputs = model(**inputs)\n",
91 | "\n",
92 | "print(inputs[\"input_ids\"].shape)\n",
93 | "print(outputs.logits.shape)"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "import torch\n",
103 | "\n",
104 | "probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist()\n",
105 | "predictions = probabilities.argmax(dim=-1)[0].tolist()\n",
106 | "print(predictions)"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "model.config.id2label"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "results = []\n",
125 | "inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)\n",
126 | "tokens = inputs_with_offsets.tokens()\n",
127 | "offsets = inputs_with_offsets[\"offset_mapping\"]\n",
128 | "\n",
129 | "for idx, pred in enumerate(predictions):\n",
130 | " label = model.config.id2label[pred]\n",
131 | " if label != \"O\":\n",
132 | " start, end = offsets[idx]\n",
133 | " results.append(\n",
134 | " {\"entity\": label, \"score\": probabilities[idx][pred],\n",
135 | " \"word\": tokens[idx], \"start\": start, \"end\": end}\n",
136 | " )\n",
137 | "\n",
138 | "print(results)"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": null,
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "import numpy as np\n",
148 | "\n",
149 | "label_map = model.config.id2label\n",
150 | "results = []\n",
151 | "idx = 0\n",
152 | "while idx < len(predictions):\n",
153 | " pred = predictions[idx]\n",
154 | " label = label_map[pred]\n",
155 | " if label != \"O\":\n",
156 | " # Remove the B- or I-\n",
157 | " label = label[2:]\n",
158 | " start, _ = offsets[idx]\n",
159 | "\n",
160 | " # Grab all the tokens labeled with I-label\n",
161 | " all_scores = []\n",
162 | " while idx < len(predictions) and label_map[predictions[idx]] == f\"I-{label}\":\n",
163 | " all_scores.append(probabilities[idx][pred])\n",
164 | " _, end = offsets[idx]\n",
165 | " idx += 1\n",
166 | "\n",
167 | " # The score is the mean of all the scores of the token in that grouped entity.\n",
168 | " score = np.mean(all_scores).item()\n",
169 | " word = example[start:end]\n",
170 | " results.append(\n",
171 | " {\"entity_group\": label, \"score\": score,\n",
172 | " \"word\": word, \"start\": start, \"end\": end}\n",
173 | " )\n",
174 | " idx += 1"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "metadata": {},
181 | "outputs": [],
182 | "source": []
183 | }
184 | ],
185 | "metadata": {
186 | "colab": {
187 | "name": "Inside the Token classification pipeline (PyTorch)",
188 | "provenance": []
189 | }
190 | },
191 | "nbformat": 4,
192 | "nbformat_minor": 4
193 | }
194 |
--------------------------------------------------------------------------------
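The hand-rolled loop above is meant to reproduce the grouping that the pipeline performs with `aggregation_strategy="simple"`. A small check sketch, assuming the `example` string and `results` list from the notebook and that an actual NER checkpoint was filled in for the blank `model_checkpoint`:

```python
from transformers import pipeline

# Both outputs should contain one entry per grouped entity with comparable scores and spans.
token_classifier = pipeline("token-classification", aggregation_strategy="simple")
print(token_classifier(example))
print(results)
```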