├── .gitignore ├── README.md ├── sagemaker ├── 14_train_and_push_to_hub │ ├── README.md │ ├── imgs │ │ └── emotion-widget.png │ └── scripts │ │ └── train.py ├── 15_training_compiler │ ├── imgs │ │ └── emotion-widget.png │ └── scripts │ │ └── train.py ├── 13_deploy_and_autoscaling_transformers │ └── imgs │ │ ├── sm-endpoint.png │ │ ├── scaling-options.jpeg │ │ ├── autoscaling-endpoint.png │ │ ├── hf-inference-toolkit.png │ │ └── model-monitoring-dashboard.png ├── 01_getting_started_pytorch │ └── scripts │ │ └── train.py ├── 06_sagemaker_metrics │ └── scripts │ │ └── train.py ├── 05_spot_instances │ └── scripts │ │ └── train.py ├── 02_getting_started_tensorflow │ └── scripts │ │ └── train.py └── 09_image_classification_vision_transformer │ └── scripts │ └── train.py ├── Makefile ├── examples └── images │ ├── translation.png │ ├── summarization.png │ ├── model_parameters.png │ ├── question_answering.png │ ├── text_classification.png │ ├── token_classification.png │ ├── causal_language_modeling.png │ └── masked_language_modeling.png ├── longform-qa └── images │ ├── fireworks.gif │ ├── ELI5animation.gif │ └── huggingface_logo.jpg ├── transformers_doc ├── imgs │ ├── ppl_full.gif │ ├── ppl_chunked.gif │ └── ppl_sliding.gif └── README.md └── course ├── chapter8 ├── section5.ipynb └── section3.ipynb ├── chapter1 └── section8.ipynb ├── videos ├── pre_tokenization.ipynb ├── rouge_metric.ipynb ├── perplexity.ipynb ├── normalization.ipynb ├── offset_mapping.ipynb ├── domain_adaptation.ipynb ├── bleu_metric.ipynb ├── datasets_and_dataframes.ipynb ├── fast_tokenizers.ipynb ├── debug_error.ipynb ├── summarization_processing.ipynb ├── clm_processing.ipynb ├── load_custom_dataset.ipynb ├── debug_training_tf.ipynb ├── save_load_dataset.ipynb ├── mlm_processing.ipynb ├── building_tokenizer.ipynb ├── memory_mapping_streaming.ipynb ├── train_new_tokenizer.ipynb ├── slice_and_dice.ipynb ├── custom_loss.ipynb ├── token_processing.ipynb ├── translation_processing.ipynb ├── sentence_pairs_tf.ipynb ├── tensorflow_finetuning.ipynb ├── semantic_search.ipynb └── token_pipeline_pt.ipynb ├── chapter4 ├── section2_pt.ipynb └── section2_tf.ipynb ├── chapter2 ├── section3_pt.ipynb ├── section3_tf.ipynb ├── section4_pt.ipynb ├── section4_tf.ipynb ├── section6_pt.ipynb └── section6_tf.ipynb ├── chapter6 └── section4.ipynb ├── chapter5 └── section2.ipynb └── chapter3 ├── section3.ipynb └── section3_tf.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Jupyter Notebook 2 | .ipynb_checkpoints 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # notebooks 2 | Notebooks using the Hugging Face libraries 🤗 3 | -------------------------------------------------------------------------------- /sagemaker/14_train_and_push_to_hub/README.md: -------------------------------------------------------------------------------- 1 | # SageMaker push to hf.co/models example -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: doc-notebooks 2 | 3 | doc-notebooks: 4 | python utils/convert_doc_to_notebooks.py 5 | -------------------------------------------------------------------------------- /examples/images/translation.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/translation.png -------------------------------------------------------------------------------- /longform-qa/images/fireworks.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/longform-qa/images/fireworks.gif -------------------------------------------------------------------------------- /examples/images/summarization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/summarization.png -------------------------------------------------------------------------------- /transformers_doc/imgs/ppl_full.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/transformers_doc/imgs/ppl_full.gif -------------------------------------------------------------------------------- /examples/images/model_parameters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/model_parameters.png -------------------------------------------------------------------------------- /longform-qa/images/ELI5animation.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/longform-qa/images/ELI5animation.gif -------------------------------------------------------------------------------- /transformers_doc/imgs/ppl_chunked.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/transformers_doc/imgs/ppl_chunked.gif -------------------------------------------------------------------------------- /transformers_doc/imgs/ppl_sliding.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/transformers_doc/imgs/ppl_sliding.gif -------------------------------------------------------------------------------- /examples/images/question_answering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/question_answering.png -------------------------------------------------------------------------------- /examples/images/text_classification.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/text_classification.png -------------------------------------------------------------------------------- /longform-qa/images/huggingface_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/longform-qa/images/huggingface_logo.jpg -------------------------------------------------------------------------------- /examples/images/token_classification.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/token_classification.png -------------------------------------------------------------------------------- /examples/images/causal_language_modeling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/causal_language_modeling.png -------------------------------------------------------------------------------- /examples/images/masked_language_modeling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/examples/images/masked_language_modeling.png -------------------------------------------------------------------------------- /sagemaker/15_training_compiler/imgs/emotion-widget.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/sagemaker/15_training_compiler/imgs/emotion-widget.png -------------------------------------------------------------------------------- /sagemaker/14_train_and_push_to_hub/imgs/emotion-widget.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/sagemaker/14_train_and_push_to_hub/imgs/emotion-widget.png -------------------------------------------------------------------------------- /sagemaker/13_deploy_and_autoscaling_transformers/imgs/sm-endpoint.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/sagemaker/13_deploy_and_autoscaling_transformers/imgs/sm-endpoint.png -------------------------------------------------------------------------------- /sagemaker/13_deploy_and_autoscaling_transformers/imgs/scaling-options.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/sagemaker/13_deploy_and_autoscaling_transformers/imgs/scaling-options.jpeg -------------------------------------------------------------------------------- /sagemaker/13_deploy_and_autoscaling_transformers/imgs/autoscaling-endpoint.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/sagemaker/13_deploy_and_autoscaling_transformers/imgs/autoscaling-endpoint.png -------------------------------------------------------------------------------- /sagemaker/13_deploy_and_autoscaling_transformers/imgs/hf-inference-toolkit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/sagemaker/13_deploy_and_autoscaling_transformers/imgs/hf-inference-toolkit.png -------------------------------------------------------------------------------- /sagemaker/13_deploy_and_autoscaling_transformers/imgs/model-monitoring-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/notebooks/master/sagemaker/13_deploy_and_autoscaling_transformers/imgs/model-monitoring-dashboard.png 
-------------------------------------------------------------------------------- /transformers_doc/README.md: -------------------------------------------------------------------------------- 1 | # 🤗 Transformers doc notebooks 2 | 3 | These notebooks are automatically generated from the [🤗 Transformers documentation](https://huggingface.co/transformers/) 4 | so you should not make any direct modification here. If there is a typo to fix or a sentence to add, open a pull 5 | request in the [🤗 Transformers repo](https://github.com/huggingface/transformers) and fix the corresponding file in 6 | the `docs/source/` folder. 7 | 8 | If there is something that seems weirdly converted from the original doc file, open an issue in this repo and we will 9 | try to fix the conversion script. -------------------------------------------------------------------------------- /course/chapter8/section5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# How to write a good issue" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Install the Transformers and Datasets libraries to run this notebook." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "!pip install datasets transformers[sentencepiece]" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [] 32 | } 33 | ], 34 | "metadata": { 35 | "colab": { 36 | "name": "How to write a good issue", 37 | "provenance": [] 38 | } 39 | }, 40 | "nbformat": 4, 41 | "nbformat_minor": 4 42 | } 43 | -------------------------------------------------------------------------------- /course/chapter1/section8.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Bias and limitations" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Install the Transformers and Datasets libraries to run this notebook." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "!pip install datasets transformers[sentencepiece]" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/plain": [ 34 | "['lawyer', 'carpenter', 'doctor', 'waiter', 'mechanic']\n", 35 | "['nurse', 'waitress', 'teacher', 'maid', 'prostitute']" 36 | ] 37 | }, 38 | "execution_count": null, 39 | "metadata": {}, 40 | "output_type": "execute_result" 41 | } 42 | ], 43 | "source": [ 44 | "from transformers import pipeline\n", 45 | "\n", 46 | "unmasker = pipeline(\"fill-mask\", model=\"bert-base-uncased\")\n", 47 | "result = unmasker(\"This man works as a [MASK].\")\n", 48 | "print([r[\"token_str\"] for r in result])\n", 49 | "\n", 50 | "result = unmasker(\"This woman works as a [MASK].\")\n", 51 | "print([r[\"token_str\"] for r in result])" 52 | ] 53 | } 54 | ], 55 | "metadata": { 56 | "colab": { 57 | "name": "Bias and limitations", 58 | "provenance": [] 59 | } 60 | }, 61 | "nbformat": 4, 62 | "nbformat_minor": 4 63 | } 64 | -------------------------------------------------------------------------------- /course/videos/pre_tokenization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from transformers import AutoTokenizer\n", 61 | "\n", 62 | "tokenizer = AutoTokenizer.from_pretrained('albert-base-v1')\n", 63 | "\n", 64 | "text = \"3.2.1: let's get started!\"\n", 65 | "\n", 66 | "print(tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text))" 67 | ] 68 | } 69 | ], 70 | "metadata": { 71 | "colab": { 72 | "name": "What is pre-tokenization?", 73 | "provenance": [] 74 | } 75 | }, 76 | "nbformat": 4, 77 | "nbformat_minor": 4 78 | } 79 | -------------------------------------------------------------------------------- /course/videos/rouge_metric.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "! pip install nltk rouge_score" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "from datasets import load_metric\n", 70 | "\n", 71 | "rouge = load_metric(\"rouge\")\n", 72 | "predictions = [\"I really loved reading the Hunger Games\"]\n", 73 | "references = [\"I loved reading the Hunger Games\"]\n", 74 | "rouge.compute(predictions=predictions, references=references)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [] 83 | } 84 | ], 85 | "metadata": { 86 | "colab": { 87 | "name": "What is the ROUGE metric?", 88 | "provenance": [] 89 | } 90 | }, 91 | "nbformat": 4, 92 | "nbformat_minor": 4 93 | } 94 | -------------------------------------------------------------------------------- /course/videos/perplexity.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! 
pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from transformers import AutoModelForCausalLM, AutoTokenizer\n", 61 | "import torch\n", 62 | "\n", 63 | "model = AutoModelForCausalLM.from_pretrained(\"gpt2\")\n", 64 | "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n", 65 | "\n", 66 | "inputs = tokenizer(\"Hugging Face is a startup based in New York City and Paris\",\n", 67 | " return_tensors=\"pt\")\n", 68 | "\n", 69 | "loss = model(input_ids=inputs[\"input_ids\"],\n", 70 | " labels=inputs[\"input_ids\"]).loss\n", 71 | "\n", 72 | "ppl = torch.exp(loss)\n", 73 | "\n", 74 | "print(f\"Perplexity: {ppl.item():.2f}\")" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [] 83 | } 84 | ], 85 | "metadata": { 86 | "colab": { 87 | "name": "What is perplexity?", 88 | "provenance": [] 89 | } 90 | }, 91 | "nbformat": 4, 92 | "nbformat_minor": 4 93 | } 94 | -------------------------------------------------------------------------------- /course/videos/normalization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! 
pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from transformers import AutoTokenizer\n", 61 | "\n", 62 | "text = \"This is a text with àccënts and CAPITAL LETTERS\"\n", 63 | "\n", 64 | "tokenizer = AutoTokenizer.from_pretrained(\"albert-large-v2\")\n", 65 | "print(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))\n", 66 | "\n", 67 | "tokenizer = AutoTokenizer.from_pretrained(\"huggingface-course/albert-tokenizer-without-normalizer\")\n", 68 | "print(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "text = \"un père indigné\"\n", 78 | "\n", 79 | "tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')\n", 80 | "print(tokenizer.backend_tokenizer.normalizer.normalize_str(text))" 81 | ] 82 | } 83 | ], 84 | "metadata": { 85 | "colab": { 86 | "name": "What is normalization?", 87 | "provenance": [] 88 | } 89 | }, 90 | "nbformat": 4, 91 | "nbformat_minor": 4 92 | } 93 | -------------------------------------------------------------------------------- /course/chapter4/section2_pt.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Using pretrained models (PyTorch)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Install the Transformers and Datasets libraries to run this notebook." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "!pip install datasets transformers[sentencepiece]" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/plain": [ 34 | "[\n", 35 | " {'sequence': 'Le camembert est délicieux :)', 'score': 0.49091005325317383, 'token': 7200, 'token_str': 'délicieux'}, \n", 36 | " {'sequence': 'Le camembert est excellent :)', 'score': 0.1055697426199913, 'token': 2183, 'token_str': 'excellent'}, \n", 37 | " {'sequence': 'Le camembert est succulent :)', 'score': 0.03453313186764717, 'token': 26202, 'token_str': 'succulent'}, \n", 38 | " {'sequence': 'Le camembert est meilleur :)', 'score': 0.0330314114689827, 'token': 528, 'token_str': 'meilleur'}, \n", 39 | " {'sequence': 'Le camembert est parfait :)', 'score': 0.03007650189101696, 'token': 1654, 'token_str': 'parfait'}\n", 40 | "]" 41 | ] 42 | }, 43 | "execution_count": null, 44 | "metadata": {}, 45 | "output_type": "execute_result" 46 | } 47 | ], 48 | "source": [ 49 | "from transformers import pipeline\n", 50 | "\n", 51 | "camembert_fill_mask = pipeline(\"fill-mask\", model=\"camembert-base\")\n", 52 | "results = camembert_fill_mask(\"Le camembert est :)\")" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "from transformers import CamembertTokenizer, CamembertForMaskedLM\n", 62 | "\n", 63 | "tokenizer = CamembertTokenizer.from_pretrained(\"camembert-base\")\n", 64 | "model = CamembertForMaskedLM.from_pretrained(\"camembert-base\")" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | 
"from transformers import AutoTokenizer, AutoModelForMaskedLM\n", 74 | "\n", 75 | "tokenizer = AutoTokenizer.from_pretrained(\"camembert-base\")\n", 76 | "model = AutoModelForMaskedLM.from_pretrained(\"camembert-base\")" 77 | ] 78 | } 79 | ], 80 | "metadata": { 81 | "colab": { 82 | "name": "Using pretrained models (PyTorch)", 83 | "provenance": [] 84 | } 85 | }, 86 | "nbformat": 4, 87 | "nbformat_minor": 4 88 | } 89 | -------------------------------------------------------------------------------- /course/chapter4/section2_tf.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Using pretrained models (TensorFlow)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Install the Transformers and Datasets libraries to run this notebook." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "!pip install datasets transformers[sentencepiece]" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/plain": [ 34 | "[\n", 35 | " {'sequence': 'Le camembert est délicieux :)', 'score': 0.49091005325317383, 'token': 7200, 'token_str': 'délicieux'}, \n", 36 | " {'sequence': 'Le camembert est excellent :)', 'score': 0.1055697426199913, 'token': 2183, 'token_str': 'excellent'}, \n", 37 | " {'sequence': 'Le camembert est succulent :)', 'score': 0.03453313186764717, 'token': 26202, 'token_str': 'succulent'}, \n", 38 | " {'sequence': 'Le camembert est meilleur :)', 'score': 0.0330314114689827, 'token': 528, 'token_str': 'meilleur'}, \n", 39 | " {'sequence': 'Le camembert est parfait :)', 'score': 0.03007650189101696, 'token': 1654, 'token_str': 'parfait'}\n", 40 | "]" 41 | ] 42 | }, 43 | "execution_count": null, 44 | "metadata": {}, 45 | "output_type": "execute_result" 46 | } 47 | ], 48 | "source": [ 49 | "from transformers import pipeline\n", 50 | "\n", 51 | "camembert_fill_mask = pipeline(\"fill-mask\", model=\"camembert-base\")\n", 52 | "results = camembert_fill_mask(\"Le camembert est :)\")" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "from transformers import CamembertTokenizer, TFCamembertForMaskedLM\n", 62 | "\n", 63 | "tokenizer = CamembertTokenizer.from_pretrained(\"camembert-base\")\n", 64 | "model = TFCamembertForMaskedLM.from_pretrained(\"camembert-base\")" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "from transformers import AutoTokenizer, TFAutoModelForMaskedLM\n", 74 | "\n", 75 | "tokenizer = AutoTokenizer.from_pretrained(\"camembert-base\")\n", 76 | "model = TFAutoModelForMaskedLM.from_pretrained(\"camembert-base\")" 77 | ] 78 | } 79 | ], 80 | "metadata": { 81 | "colab": { 82 | "name": "Using pretrained models (TensorFlow)", 83 | "provenance": [] 84 | } 85 | }, 86 | "nbformat": 4, 87 | "nbformat_minor": 4 88 | } 89 | -------------------------------------------------------------------------------- /course/videos/offset_mapping.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of 
the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from transformers import AutoTokenizer\n", 61 | "\n", 62 | "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n", 63 | "print(tokenizer(\"Let's talk about tokenizers superpowers.\")[\"input_ids\"])\n", 64 | "print(tokenizer(\"Let's talk about tokenizers superpowers.\")[\"input_ids\"])" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "encoding = tokenizer(\"Let's talk about tokenizers superpowers.\")\n", 74 | "print(encoding.tokens())\n", 75 | "print(encoding.word_ids())" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "encoding = tokenizer(\n", 85 | " \"Let's talk about tokenizers superpowers.\",\n", 86 | " return_offsets_mapping=True\n", 87 | ")\n", 88 | "print(encoding.tokens())\n", 89 | "print(encoding[\"offset_mapping\"])" 90 | ] 91 | } 92 | ], 93 | "metadata": { 94 | "colab": { 95 | "name": "Fast tokenizer superpowers", 96 | "provenance": [] 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 4 101 | } 102 | -------------------------------------------------------------------------------- /course/videos/domain_adaptation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! 
pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from transformers import pipeline" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "model_checkpoint = \"distilbert-base-uncased\"\n", 70 | "fill_masker = pipeline(\"fill-mask\", model=model_checkpoint)\n", 71 | "fill_masker(\"This is a great [MASK].\")" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "model_checkpoint = \"huggingface-course/distilbert-base-uncased-finetuned-imdb\"\n", 81 | "fill_masker = pipeline(\"fill-mask\", model=model_checkpoint)\n", 82 | "fill_masker(\"This is a great [MASK].\")" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "model_checkpoint = \"Helsinki-NLP/opus-mt-en-fr\"\n", 92 | "translator = pipeline(\"translation\", model=model_checkpoint)\n", 93 | "translator(\"This plugin automatically translates emails.\")" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "model_checkpoint = \"huggingface-course/marian-finetuned-kde4-en-to-fr\"\n", 103 | "translator = pipeline(\"translation\", model=model_checkpoint)\n", 104 | "translator(\"This plugin automatically translates emails.\")" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [] 113 | } 114 | ], 115 | "metadata": { 116 | "colab": { 117 | "name": "What is domain adaptation?", 118 | "provenance": [] 119 | } 120 | }, 121 | "nbformat": 4, 122 | "nbformat_minor": 4 123 | } 124 | -------------------------------------------------------------------------------- /course/chapter2/section3_pt.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Models (PyTorch)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "!pip install datasets transformers[sentencepiece]" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from transformers import BertConfig, BertModel\n", 33 | "\n", 34 | "# Building the config\n", 35 | "config = BertConfig()\n", 36 | "\n", 37 | "# Building the model from the config\n", 38 | "model = BertModel(config)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "BertConfig {\n", 50 | " [...]\n", 51 | " \"hidden_size\": 768,\n", 52 | " \"intermediate_size\": 3072,\n", 53 | " \"max_position_embeddings\": 512,\n", 54 | " \"num_attention_heads\": 12,\n", 55 | " \"num_hidden_layers\": 12,\n", 56 | " [...]\n", 57 | "}" 58 | ] 59 | }, 60 | "execution_count": null, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "print(config)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "from transformers import BertConfig, BertModel\n", 76 | "\n", 77 | "config = BertConfig()\n", 78 | "model = BertModel(config)\n", 79 | "\n", 80 | "# Model is randomly initialized!" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "from transformers import BertModel\n", 90 | "\n", 91 | "model = BertModel.from_pretrained(\"bert-base-cased\")" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "model.save_pretrained(\"directory_on_my_computer\")" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "sequences = [\"Hello!\", \"Cool.\", \"Nice!\"]" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "encoded_sequences = [\n", 119 | " [101, 7592, 999, 102],\n", 120 | " [101, 4658, 1012, 102],\n", 121 | " [101, 3835, 999, 102],\n", 122 | "]" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "import torch\n", 132 | "\n", 133 | "model_inputs = torch.tensor(encoded_sequences)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "output = model(model_inputs)" 143 | ] 144 | } 145 | ], 146 | "metadata": { 147 | "colab": { 148 | "name": "Models (PyTorch)", 149 | "provenance": [] 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 4 154 | } 155 | -------------------------------------------------------------------------------- /course/chapter6/section4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Normalization and pre-tokenization" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Install the Transformers and Datasets libraries to run this notebook." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "!pip install datasets transformers[sentencepiece]" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/plain": [ 34 | "" 35 | ] 36 | }, 37 | "execution_count": null, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "from transformers import AutoTokenizer\n", 44 | "\n", 45 | "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n", 46 | "print(type(tokenizer.backend_tokenizer))" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "'hello how are u?'" 58 | ] 59 | }, 60 | "execution_count": null, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "print(tokenizer.backend_tokenizer.normalizer.normalize_str(\"Héllò hôw are ü?\"))" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "[('Hello', (0, 5)), (',', (5, 6)), ('how', (7, 10)), ('are', (11, 14)), ('you', (16, 19)), ('?', (19, 20))]" 78 | ] 79 | }, 80 | "execution_count": null, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(\"Hello, how are you?\")" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/plain": [ 97 | "[('Hello', (0, 5)), (',', (5, 6)), ('Ġhow', (6, 10)), ('Ġare', (10, 14)), ('Ġ', (14, 15)), ('Ġyou', (15, 19)),\n", 98 | " ('?', (19, 20))]" 99 | ] 100 | }, 101 | "execution_count": null, 102 | "metadata": {}, 103 | "output_type": "execute_result" 104 | } 105 | ], 106 | "source": [ 107 | "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n", 108 | "tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(\"Hello, how are you?\")" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "data": { 118 | "text/plain": [ 119 | "[('▁Hello,', (0, 6)), ('▁how', (7, 10)), ('▁are', (11, 14)), ('▁you?', (16, 20))]" 120 | ] 121 | }, 122 | "execution_count": null, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "tokenizer = AutoTokenizer.from_pretrained(\"t5-small\")\n", 129 | "tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(\"Hello, how are you?\")" 130 | ] 131 | } 132 | ], 133 | "metadata": { 134 | "colab": { 135 | "name": "Normalization and pre-tokenization", 136 | "provenance": [] 137 | } 138 | }, 139 | "nbformat": 4, 140 | "nbformat_minor": 4 141 | } 142 | -------------------------------------------------------------------------------- /course/chapter2/section3_tf.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Models (TensorFlow)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Install the Transformers and Datasets libraries to run this notebook." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "!pip install datasets transformers[sentencepiece]" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from transformers import BertConfig, TFBertModel\n", 33 | "\n", 34 | "# Building the config\n", 35 | "config = BertConfig()\n", 36 | "\n", 37 | "# Building the model from the config\n", 38 | "model = TFBertModel(config)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "BertConfig {\n", 50 | " [...]\n", 51 | " \"hidden_size\": 768,\n", 52 | " \"intermediate_size\": 3072,\n", 53 | " \"max_position_embeddings\": 512,\n", 54 | " \"num_attention_heads\": 12,\n", 55 | " \"num_hidden_layers\": 12,\n", 56 | " [...]\n", 57 | "}" 58 | ] 59 | }, 60 | "execution_count": null, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "print(config)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "from transformers import BertConfig, TFBertModel\n", 76 | "\n", 77 | "config = BertConfig()\n", 78 | "model = TFBertModel(config)\n", 79 | "\n", 80 | "# Model is randomly initialized!" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "from transformers import TFBertModel\n", 90 | "\n", 91 | "model = TFBertModel.from_pretrained(\"bert-base-cased\")" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "model.save_pretrained(\"directory_on_my_computer\")" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "sequences = [\"Hello!\", \"Cool.\", \"Nice!\"]" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "encoded_sequences = [\n", 119 | " [101, 7592, 999, 102],\n", 120 | " [101, 4658, 1012, 102],\n", 121 | " [101, 3835, 999, 102],\n", 122 | "]" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "import tensorflow as tf\n", 132 | "\n", 133 | "model_inputs = tf.constant(encoded_sequences)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "output = model(model_inputs)" 143 | ] 144 | } 145 | ], 146 | "metadata": { 147 | "colab": { 148 | "name": "Models (TensorFlow)", 149 | "provenance": [] 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 4 154 | } 155 | -------------------------------------------------------------------------------- /sagemaker/01_getting_started_pytorch/scripts/train.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer 2 | from sklearn.metrics import accuracy_score, precision_recall_fscore_support 3 | from datasets import load_from_disk 4 | import random 5 | import logging 6 | import sys 7 | import argparse 8 | 
import os 9 | import torch 10 | 11 | if __name__ == "__main__": 12 | 13 | parser = argparse.ArgumentParser() 14 | 15 | # hyperparameters sent by the client are passed as command-line arguments to the script. 16 | parser.add_argument("--epochs", type=int, default=3) 17 | parser.add_argument("--train_batch_size", type=int, default=32) 18 | parser.add_argument("--eval_batch_size", type=int, default=64) 19 | parser.add_argument("--warmup_steps", type=int, default=500) 20 | parser.add_argument("--model_name", type=str) 21 | parser.add_argument("--learning_rate", type=str, default=5e-5) 22 | 23 | # Data, model, and output directories 24 | parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) 25 | parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"]) 26 | parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"]) 27 | parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) 28 | parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"]) 29 | 30 | args, _ = parser.parse_known_args() 31 | 32 | # Set up logging 33 | logger = logging.getLogger(__name__) 34 | 35 | logging.basicConfig( 36 | level=logging.getLevelName("INFO"), 37 | handlers=[logging.StreamHandler(sys.stdout)], 38 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 39 | ) 40 | 41 | # load datasets 42 | train_dataset = load_from_disk(args.training_dir) 43 | test_dataset = load_from_disk(args.test_dir) 44 | 45 | logger.info(f" loaded train_dataset length is: {len(train_dataset)}") 46 | logger.info(f" loaded test_dataset length is: {len(test_dataset)}") 47 | 48 | # compute metrics function for binary classification 49 | def compute_metrics(pred): 50 | labels = pred.label_ids 51 | preds = pred.predictions.argmax(-1) 52 | precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary") 53 | acc = accuracy_score(labels, preds) 54 | return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall} 55 | 56 | # download model from model hub 57 | model = AutoModelForSequenceClassification.from_pretrained(args.model_name) 58 | tokenizer = AutoTokenizer.from_pretrained(args.model_name) 59 | 60 | # define training args 61 | training_args = TrainingArguments( 62 | output_dir=args.model_dir, 63 | num_train_epochs=args.epochs, 64 | per_device_train_batch_size=args.train_batch_size, 65 | per_device_eval_batch_size=args.eval_batch_size, 66 | warmup_steps=args.warmup_steps, 67 | evaluation_strategy="epoch", 68 | logging_dir=f"{args.output_data_dir}/logs", 69 | learning_rate=float(args.learning_rate), 70 | ) 71 | 72 | # create Trainer instance 73 | trainer = Trainer( 74 | model=model, 75 | args=training_args, 76 | compute_metrics=compute_metrics, 77 | train_dataset=train_dataset, 78 | eval_dataset=test_dataset, 79 | tokenizer=tokenizer, 80 | ) 81 | 82 | # train model 83 | trainer.train() 84 | 85 | # evaluate model 86 | eval_result = trainer.evaluate(eval_dataset=test_dataset) 87 | 88 | # writes eval result to file which can be accessed later in s3 ouput 89 | with open(os.path.join(args.output_data_dir, "eval_results.txt"), "w") as writer: 90 | print(f"***** Eval results *****") 91 | for key, value in sorted(eval_result.items()): 92 | writer.write(f"{key} = {value}\n") 93 | 94 | # Saves the model to s3 95 | trainer.save_model(args.model_dir) 96 | -------------------------------------------------------------------------------- 
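The SageMaker training scripts in this repository, such as the two train.py files directly above and below, read their hyperparameters from command-line arguments and their data locations from the SM_CHANNEL_* environment variables that SageMaker injects into the training container. For context, here is a minimal launch sketch using the Hugging Face estimator from the SageMaker Python SDK; the S3 URIs, IAM role lookup, instance type, and framework versions are illustrative assumptions, not values taken from this repository.

import sagemaker
from sagemaker.huggingface import HuggingFace

# Assumes this runs in a SageMaker notebook/Studio session with an attached execution role.
role = sagemaker.get_execution_role()

# These keys become the --epochs, --train_batch_size, --model_name arguments parsed in train.py.
hyperparameters = {
    "epochs": 1,
    "train_batch_size": 32,
    "model_name": "distilbert-base-uncased",
}

huggingface_estimator = HuggingFace(
    entry_point="train.py",
    source_dir="./scripts",
    instance_type="ml.p3.2xlarge",  # illustrative instance type
    instance_count=1,
    role=role,
    transformers_version="4.6",     # assumed versions; pick a combination
    pytorch_version="1.7",          # supported by your SDK release
    py_version="py36",
    hyperparameters=hyperparameters,
)

# The channel names map to SM_CHANNEL_TRAIN / SM_CHANNEL_TEST inside the container;
# the S3 URIs below are placeholders.
huggingface_estimator.fit({
    "train": "s3://my-bucket/datasets/train",
    "test": "s3://my-bucket/datasets/test",
})

SageMaker also sets SM_MODEL_DIR, SM_OUTPUT_DATA_DIR, and SM_NUM_GPUS automatically, which is why the scripts default those arguments from os.environ.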
/sagemaker/06_sagemaker_metrics/scripts/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import random 5 | import sys 6 | 7 | from datasets import load_from_disk 8 | from sklearn.metrics import accuracy_score, precision_recall_fscore_support 9 | import torch 10 | from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments 11 | 12 | 13 | if __name__ == "__main__": 14 | 15 | parser = argparse.ArgumentParser() 16 | 17 | # hyperparameters sent by the client are passed as command-line arguments to the script. 18 | parser.add_argument("--epochs", type=int, default=3) 19 | parser.add_argument("--train_batch_size", type=int, default=32) 20 | parser.add_argument("--eval_batch_size", type=int, default=64) 21 | parser.add_argument("--warmup_steps", type=int, default=500) 22 | parser.add_argument("--model_name", type=str) 23 | parser.add_argument("--learning_rate", type=float, default=5e-5) 24 | 25 | # Data, model, and output directories 26 | parser.add_argument("--checkpoints", type=str, default="/opt/ml/checkpoints/") 27 | parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"]) 28 | parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"]) 29 | parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) 30 | parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"]) 31 | 32 | args, _ = parser.parse_known_args() 33 | 34 | # Set up logging 35 | logger = logging.getLogger(__name__) 36 | 37 | logging.basicConfig( 38 | level=logging.getLevelName("INFO"), 39 | handlers=[logging.StreamHandler(sys.stdout)], 40 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 41 | ) 42 | 43 | # load datasets 44 | train_dataset = load_from_disk(args.training_dir) 45 | test_dataset = load_from_disk(args.test_dir) 46 | 47 | logger.info(f" loaded train_dataset length is: {len(train_dataset)}") 48 | logger.info(f" loaded test_dataset length is: {len(test_dataset)}") 49 | 50 | # compute metrics function for binary classification 51 | def compute_metrics(pred): 52 | labels = pred.label_ids 53 | preds = pred.predictions.argmax(-1) 54 | precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary") 55 | acc = accuracy_score(labels, preds) 56 | return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall} 57 | 58 | # download model from model hub 59 | model = AutoModelForSequenceClassification.from_pretrained(args.model_name) 60 | tokenizer = AutoTokenizer.from_pretrained(args.model_name) 61 | 62 | # define training args 63 | training_args = TrainingArguments( 64 | output_dir=args.checkpoints, 65 | num_train_epochs=args.epochs, 66 | per_device_train_batch_size=args.train_batch_size, 67 | per_device_eval_batch_size=args.eval_batch_size, 68 | warmup_steps=args.warmup_steps, 69 | evaluation_strategy="epoch", 70 | logging_dir=f"{args.checkpoints}/logs", 71 | learning_rate=args.learning_rate, 72 | ) 73 | 74 | # create Trainer instance 75 | trainer = Trainer( 76 | model=model, 77 | args=training_args, 78 | compute_metrics=compute_metrics, 79 | train_dataset=train_dataset, 80 | eval_dataset=test_dataset, 81 | tokenizer=tokenizer, 82 | ) 83 | 84 | # train model 85 | trainer.train() 86 | 87 | # evaluate model 88 | eval_result = trainer.evaluate(eval_dataset=test_dataset) 89 | 90 | # writes eval result to file which can be accessed later in s3 ouput 91 | with
open(os.path.join(args.checkpoints, "eval_results.txt"), "w") as writer: 92 | print(f"***** Eval results *****") 93 | for key, value in sorted(eval_result.items()): 94 | writer.write(f"{key} = {value}\n") 95 | 96 | # Saves the model locally. In SageMaker, writing in /opt/ml/model sends it to S3 97 | trainer.save_model(args.model_dir) 98 | -------------------------------------------------------------------------------- /course/videos/bleu_metric.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from datasets import load_metric\n", 61 | "\n", 62 | "bleu = load_metric(\"bleu\")\n", 63 | "predictions = [[\"I\", \"have\", \"thirty\", \"six\", \"years\"]]\n", 64 | "references = [\n", 65 | " [[\"I\", \"am\", \"thirty\", \"six\", \"years\", \"old\"], [\"I\", \"am\", \"thirty\", \"six\"]]\n", 66 | "]\n", 67 | "bleu.compute(predictions=predictions, references=references)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "predictions = [[\"I\", \"have\", \"thirty\", \"six\", \"years\"]]\n", 77 | "references = [\n", 78 | " [[\"I\", \"am\", \"thirty\", \"six\", \"years\", \"old\"], [\"I\", \"am\", \"thirty\", \"six\"]]\n", 79 | "]\n", 80 | "bleu.compute(predictions=predictions, references=references)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "predictions = [[\"I\", \"have\", \"thirty\", \"six\", \"years\"]]\n", 90 | "references = [\n", 91 | " [[\"I\", \"am\", \"thirty\", \"six\", \"years\", \"old\"], [\"I\", \"am\", \"thirty\", \"six\"]]\n", 92 | "]\n", 93 | "bleu.compute(predictions=predictions, references=references)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "! 
pip install sacrebleu" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "sacrebleu = load_metric(\"sacrebleu\")\n", 112 | "# SacreBLEU operates on raw text, not tokens\n", 113 | "predictions = [\"I have thirty six years\"]\n", 114 | "references = [[\"I am thirty six years old\", \"I am thirty six\"]]\n", 115 | "sacrebleu.compute(predictions=predictions, references=references)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [] 124 | } 125 | ], 126 | "metadata": { 127 | "colab": { 128 | "name": "What is the BLEU metric?", 129 | "provenance": [] 130 | } 131 | }, 132 | "nbformat": 4, 133 | "nbformat_minor": 4 134 | } 135 | -------------------------------------------------------------------------------- /course/videos/datasets_and_dataframes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! 
pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from datasets import load_dataset\n", 61 | "\n", 62 | "dataset = load_dataset(\"swiss_judgment_prediction\", \"all_languages\", split=\"train\")\n", 63 | "dataset[0]" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "# Convert the output format to pandas.DataFrame\n", 73 | "dataset.set_format(\"pandas\")\n", 74 | "dataset[0]" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "dataset.__getitem__(0)\n", 84 | "\n", 85 | "dataset.set_format(\"pandas\")\n", 86 | "\n", 87 | "dataset.__getitem__(0)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "df = dataset.to_pandas()\n", 97 | "df.head()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "# How are languages distributed across regions?\n", 107 | "df.groupby(\"region\")[\"language\"].value_counts()\n", 108 | "\n", 109 | "# Which legal area is most common?\n", 110 | "df[\"legal area\"].value_counts()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "from transformers import AutoTokenizer\n", 120 | "\n", 121 | "# Load a pretrained tokenizer\n", 122 | "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n", 123 | "# Tokenize the `text` column\n", 124 | "dataset.map(lambda x : tokenizer(x[\"text\"]))" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "# Reset back to Arrow format\n", 134 | "dataset.reset_format()\n", 135 | "# Now we can tokenize!\n", 136 | "dataset.map(lambda x : tokenizer(x[\"text\"]))" 137 | ] 138 | } 139 | ], 140 | "metadata": { 141 | "colab": { 142 | "name": "Datasets + DataFrames = ❤️", 143 | "provenance": [] 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 4 148 | } 149 | -------------------------------------------------------------------------------- /course/videos/fast_tokenizers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 
43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from datasets import load_dataset\n", 61 | "\n", 62 | "raw_datasets = load_dataset(\"glue\", \"mnli\")\n", 63 | "raw_datasets" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "from transformers import AutoTokenizer\n", 73 | "\n", 74 | "fast_tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n", 75 | "\n", 76 | "def tokenize_with_fast(examples):\n", 77 | " return fast_tokenizer(\n", 78 | " examples[\"premise\"], examples[\"hypothesis\"], truncation=True\n", 79 | " )" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "slow_tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\", use_fast=False)\n", 89 | "\n", 90 | "def tokenize_with_slow(examples):\n", 91 | " return slow_tokenizer(\n", 92 | " examples[\"premise\"], examples[\"hypothesis\"], truncation=True\n", 93 | " )" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "%time tokenized_datasets = raw_datasets.map(tokenize_with_fast)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "%time tokenized_datasets = raw_datasets.map(tokenize_with_slow)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "%time tokenized_datasets = raw_datasets.map(tokenize_with_fast, batched=True)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "%time tokenized_datasets = raw_datasets.map(tokenize_with_slow, batched=True)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [] 138 | } 139 | ], 140 | "metadata": { 141 | "colab": { 142 | "name": "Why are fast tokenizers called fast?", 143 | "provenance": [] 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 4 148 | } 149 | -------------------------------------------------------------------------------- /sagemaker/05_spot_instances/scripts/train.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments 2 | from transformers.trainer_utils import get_last_checkpoint 3 | 4 | from sklearn.metrics import accuracy_score, precision_recall_fscore_support 5 | from datasets import load_from_disk 6 | import logging 7 | import sys 8 | import argparse 9 | import os 10 | 11 | # Set up logging 12 | logger = logging.getLogger(__name__) 13 | 14 | logging.basicConfig( 15 | level=logging.getLevelName("INFO"), 16 | handlers=[logging.StreamHandler(sys.stdout)], 17 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 18 | ) 19 | 20 | if __name__ == "__main__": 21 | 22 | logger.info(sys.argv) 23 | 24 | parser = argparse.ArgumentParser() 25 | 26 | # hyperparameters sent by the client are passed as
command-line arguments to the script. 27 | parser.add_argument("--epochs", type=int, default=3) 28 | parser.add_argument("--train_batch_size", type=int, default=32) 29 | parser.add_argument("--eval_batch_size", type=int, default=64) 30 | parser.add_argument("--warmup_steps", type=int, default=500) 31 | parser.add_argument("--model_name", type=str) 32 | parser.add_argument("--learning_rate", type=str, default=5e-5) 33 | parser.add_argument("--output_dir", type=str) 34 | 35 | # Data, model, and output directories 36 | parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) 37 | parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"]) 38 | parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"]) 39 | parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) 40 | parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"]) 41 | 42 | args, _ = parser.parse_known_args() 43 | 44 | # load datasets 45 | train_dataset = load_from_disk(args.training_dir) 46 | test_dataset = load_from_disk(args.test_dir) 47 | 48 | logger.info(f" loaded train_dataset length is: {len(train_dataset)}") 49 | logger.info(f" loaded test_dataset length is: {len(test_dataset)}") 50 | 51 | # compute metrics function for binary classification 52 | def compute_metrics(pred): 53 | labels = pred.label_ids 54 | preds = pred.predictions.argmax(-1) 55 | precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary") 56 | acc = accuracy_score(labels, preds) 57 | return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall} 58 | 59 | # download model from model hub 60 | model = AutoModelForSequenceClassification.from_pretrained(args.model_name) 61 | tokenizer = AutoTokenizer.from_pretrained(args.model_name) 62 | 63 | # define training args 64 | training_args = TrainingArguments( 65 | output_dir=args.output_dir, 66 | num_train_epochs=args.epochs, 67 | per_device_train_batch_size=args.train_batch_size, 68 | per_device_eval_batch_size=args.eval_batch_size, 69 | warmup_steps=args.warmup_steps, 70 | evaluation_strategy="epoch", 71 | logging_dir=f"{args.output_data_dir}/logs", 72 | learning_rate=float(args.learning_rate), 73 | ) 74 | 75 | # create Trainer instance 76 | trainer = Trainer( 77 | model=model, 78 | args=training_args, 79 | compute_metrics=compute_metrics, 80 | train_dataset=train_dataset, 81 | eval_dataset=test_dataset, 82 | tokenizer=tokenizer, 83 | ) 84 | 85 | # train model 86 | if get_last_checkpoint(args.output_dir) is not None: 87 | logger.info("***** continue training *****") 88 | last_checkpoint = get_last_checkpoint(args.output_dir) 89 | trainer.train(resume_from_checkpoint=last_checkpoint) 90 | else: 91 | trainer.train() 92 | # evaluate model 93 | eval_result = trainer.evaluate(eval_dataset=test_dataset) 94 | 95 | # writes eval result to file which can be accessed later in s3 ouput 96 | with open(os.path.join(args.output_data_dir, "eval_results.txt"), "w") as writer: 97 | print(f"***** Eval results *****") 98 | for key, value in sorted(eval_result.items()): 99 | writer.write(f"{key} = {value}\n") 100 | 101 | # Saves the model to s3 102 | trainer.save_model(args.model_dir) 103 | -------------------------------------------------------------------------------- /course/videos/debug_error.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | 
"source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from transformers import pipeline\n", 61 | "\n", 62 | "model_checkpoint = \"distillbert-base-cased-distilled-squad\"\n", 63 | "question_answerer = pipeline(\"question_answering\", model=model_checkpoint)\n", 64 | "\n", 65 | "context = \"\"\"\n", 66 | "🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration between them. It's straightforward to train your models with one before loading them for inference with the other.\n", 67 | "\"\"\"\n", 68 | "question = \"Which deep learning libraries back 🤗 Transformers?\"\n", 69 | "question_answerer(question=question, context=context)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "from transformers import pipeline\n", 79 | "\n", 80 | "model_checkpoint = \"distillbert-base-cased-distilled-squad\"\n", 81 | "question_answerer = pipeline(\"question-answering\", model=model_checkpoint)\n", 82 | "\n", 83 | "context = \"\"\"\n", 84 | "🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration between them. It's straightforward to train your models with one before loading them for inference with the other.\n", 85 | "\"\"\"\n", 86 | "question = \"Which deep learning libraries back 🤗 Transformers?\"\n", 87 | "question_answerer(question=question, context=context)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "from transformers import pipeline\n", 97 | "\n", 98 | "model_checkpoint = \"distilbert-base-cased-distilled-squad\"\n", 99 | "question_answerer = pipeline(\"question-answering\", model=model_checkpoint)\n", 100 | "\n", 101 | "context = \"\"\"\n", 102 | "🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration between them. 
It's straightforward to train your models with one before loading them for inference with the other.\n", 103 | "\"\"\"\n", 104 | "question = \"Which deep learning libraries back 🤗 Transformers?\"\n", 105 | "question_answerer(question=question, context=context)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [] 114 | } 115 | ], 116 | "metadata": { 117 | "colab": { 118 | "name": "What to do when you get an error?", 119 | "provenance": [] 120 | } 121 | }, 122 | "nbformat": 4, 123 | "nbformat_minor": 4 124 | } 125 | -------------------------------------------------------------------------------- /course/videos/summarization_processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! 
pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from datasets import load_dataset, load_metric\n", 61 | "\n", 62 | "raw_datasets = load_dataset(\"xsum\")\n", 63 | "raw_datasets = raw_datasets.remove_columns([\"id\"])\n", 64 | "raw_datasets[\"train\"]" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "print(raw_datasets[\"train\"][1])" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "from transformers import AutoTokenizer\n", 83 | "\n", 84 | "model_checkpoint = \"t5-small\"\n", 85 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n", 86 | "\n", 87 | "sample = raw_datasets[\"train\"][1]\n", 88 | "inputs = tokenizer(sample[\"document\"])\n", 89 | "with tokenizer.as_target_tokenizer():\n", 90 | " targets = tokenizer(sample[\"summary\"])\n", 91 | "\n", 92 | "print(tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"]))\n", 93 | "print(tokenizer.convert_ids_to_tokens(targets[\"input_ids\"]))" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "max_input_length = 1024\n", 103 | "max_target_length = 128\n", 104 | "\n", 105 | "def preprocess_function(examples):\n", 106 | " model_inputs = tokenizer(examples[\"document\"], max_length=max_input_length, truncation=True)\n", 107 | "\n", 108 | " # Setup the tokenizer for targets\n", 109 | " with tokenizer.as_target_tokenizer():\n", 110 | " labels = tokenizer(examples[\"summary\"], max_length=max_target_length, truncation=True)\n", 111 | "\n", 112 | " model_inputs[\"labels\"] = labels[\"input_ids\"]\n", 113 | " return model_inputs\n", 114 | "\n", 115 | "tokenized_datasets = raw_datasets.map(\n", 116 | " preprocess_function, batched=True, remove_columns=[\"document\", \"summary\"]\n", 117 | ")" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq\n", 127 | "\n", 128 | "model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)\n", "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [] 137 | } 138 | ], 139 | "metadata": { 140 | "colab": { 141 | "name": "Data processing for Summarization", 142 | "provenance": [] 143 | } 144 | }, 145 | "nbformat": 4, 146 | "nbformat_minor": 4 147 | } 148 | -------------------------------------------------------------------------------- /course/chapter2/section4_pt.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tokenizers (PyTorch)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Install the Transformers and Datasets libraries to run this notebook."
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "!pip install datasets transformers[sentencepiece]" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/plain": [ 34 | "['Jim', 'Henson', 'was', 'a', 'puppeteer']" 35 | ] 36 | }, 37 | "execution_count": null, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "tokenized_text = \"Jim Henson was a puppeteer\".split()\n", 44 | "print(tokenized_text)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "from transformers import BertTokenizer\n", 54 | "\n", 55 | "tokenizer = BertTokenizer.from_pretrained(\"bert-base-cased\")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "from transformers import AutoTokenizer\n", 65 | "\n", 66 | "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "{'input_ids': [101, 7993, 170, 11303, 1200, 2443, 1110, 3014, 102],\n", 78 | " 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0],\n", 79 | " 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}" 80 | ] 81 | }, 82 | "execution_count": null, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "tokenizer(\"Using a Transformer network is simple\")" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "tokenizer.save_pretrained(\"directory_on_my_computer\")" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "['Using', 'a', 'transform', '##er', 'network', 'is', 'simple']" 109 | ] 110 | }, 111 | "execution_count": null, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "from transformers import AutoTokenizer\n", 118 | "\n", 119 | "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n", 120 | "\n", 121 | "sequence = \"Using a Transformer network is simple\"\n", 122 | "tokens = tokenizer.tokenize(sequence)\n", 123 | "\n", 124 | "print(tokens)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "data": { 134 | "text/plain": [ 135 | "[7993, 170, 11303, 1200, 2443, 1110, 3014]" 136 | ] 137 | }, 138 | "execution_count": null, 139 | "metadata": {}, 140 | "output_type": "execute_result" 141 | } 142 | ], 143 | "source": [ 144 | "ids = tokenizer.convert_tokens_to_ids(tokens)\n", 145 | "\n", 146 | "print(ids)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "data": { 156 | "text/plain": [ 157 | "'Using a Transformer network is simple'" 158 | ] 159 | }, 160 | "execution_count": null, 161 | "metadata": {}, 162 | "output_type": "execute_result" 163 | } 164 | ], 165 | "source": [ 166 | "decoded_string = tokenizer.decode([7993, 170, 11303, 1200, 2443, 1110, 3014])\n", 167 | "print(decoded_string)" 168 | ] 169 | } 170 | ], 171 | 
"metadata": { 172 | "colab": { 173 | "name": "Tokenizers (PyTorch)", 174 | "provenance": [] 175 | } 176 | }, 177 | "nbformat": 4, 178 | "nbformat_minor": 4 179 | } 180 | -------------------------------------------------------------------------------- /course/chapter2/section4_tf.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tokenizers (TensorFlow)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Install the Transformers and Datasets libraries to run this notebook." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "!pip install datasets transformers[sentencepiece]" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/plain": [ 34 | "['Jim', 'Henson', 'was', 'a', 'puppeteer']" 35 | ] 36 | }, 37 | "execution_count": null, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "tokenized_text = \"Jim Henson was a puppeteer\".split()\n", 44 | "print(tokenized_text)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "from transformers import BertTokenizer\n", 54 | "\n", 55 | "tokenizer = BertTokenizer.from_pretrained(\"bert-base-cased\")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "from transformers import AutoTokenizer\n", 65 | "\n", 66 | "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "{'input_ids': [101, 7993, 170, 11303, 1200, 2443, 1110, 3014, 102],\n", 78 | " 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0],\n", 79 | " 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}" 80 | ] 81 | }, 82 | "execution_count": null, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "tokenizer(\"Using a Transformer network is simple\")" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "tokenizer.save_pretrained(\"directory_on_my_computer\")" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "['Using', 'a', 'transform', '##er', 'network', 'is', 'simple']" 109 | ] 110 | }, 111 | "execution_count": null, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "from transformers import AutoTokenizer\n", 118 | "\n", 119 | "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n", 120 | "\n", 121 | "sequence = \"Using a Transformer network is simple\"\n", 122 | "tokens = tokenizer.tokenize(sequence)\n", 123 | "\n", 124 | "print(tokens)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "data": { 134 | "text/plain": [ 135 | "[7993, 170, 11303, 1200, 2443, 1110, 3014]" 136 | ] 137 | }, 138 | "execution_count": null, 139 | "metadata": {}, 140 | 
"output_type": "execute_result" 141 | } 142 | ], 143 | "source": [ 144 | "ids = tokenizer.convert_tokens_to_ids(tokens)\n", 145 | "\n", 146 | "print(ids)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "data": { 156 | "text/plain": [ 157 | "'Using a Transformer network is simple'" 158 | ] 159 | }, 160 | "execution_count": null, 161 | "metadata": {}, 162 | "output_type": "execute_result" 163 | } 164 | ], 165 | "source": [ 166 | "decoded_string = tokenizer.decode([7993, 170, 11303, 1200, 2443, 1110, 3014])\n", 167 | "print(decoded_string)" 168 | ] 169 | } 170 | ], 171 | "metadata": { 172 | "colab": { 173 | "name": "Tokenizers (TensorFlow)", 174 | "provenance": [] 175 | } 176 | }, 177 | "nbformat": 4, 178 | "nbformat_minor": 4 179 | } 180 | -------------------------------------------------------------------------------- /course/videos/clm_processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! 
pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from transformers import AutoTokenizer, AutoModelForCausalLM\n", 61 | "from datasets import load_dataset, DatasetDict\n", 62 | "\n", 63 | "ds_train = load_dataset(\"huggingface-course/codeparrot-ds-train\", split=\"train\")\n", 64 | "ds_valid = load_dataset(\"huggingface-course/codeparrot-ds-valid\", split=\"train\")\n", 65 | "\n", 66 | "raw_datasets = DatasetDict(\n", 67 | " {\n", 68 | " \"train\": ds_train,\n", 69 | " \"valid\": ds_valid,\n", 70 | " }\n", 71 | ")\n", 72 | "\n", 73 | "tokenizer = AutoTokenizer.from_pretrained(\"huggingface-course/code-search-net-tokenizer\")\n", 74 | "model = AutoModelForCausalLM.from_pretrained(\"huggingface-course/codeparrot-ds\")\n", 75 | "batch = tokenizer([\"import numpy as np\"], return_tensors=\"pt\")\n", 76 | "\n", 77 | "text = \"import numpy as np\\n\"*20\n", 78 | "context_length = 128" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "outputs = tokenizer(\n", 88 | " text,\n", 89 | " truncation=True,\n", 90 | " max_length=16,\n", 91 | " return_overflowing_tokens=True,\n", 92 | " return_length=True,\n", 93 | " )\n", 94 | "\n", 95 | "print(f\"Input chunk lengths: {(outputs['length'])}\")" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "def tokenize(element):\n", 105 | " outputs = tokenizer(\n", 106 | " element[\"content\"],\n", 107 | " truncation=True,\n", 108 | " max_length=context_length,\n", 109 | " return_overflowing_tokens=True,\n", 110 | " return_length=True,\n", 111 | " )\n", 112 | " input_batch = []\n", 113 | " for length, input_ids in zip(outputs[\"length\"], outputs[\"input_ids\"]):\n", 114 | " if length == context_length:\n", 115 | " input_batch.append(input_ids)\n", 116 | " return {\"input_ids\": input_batch}\n", 117 | "\n", 118 | "\n", 119 | "tokenized_datasets = raw_datasets.map(\n", 120 | " tokenize, batched=True, remove_columns=raw_datasets[\"train\"].column_names\n", 121 | ")" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "output = model(input_ids=batch[\"input_ids\"], labels=batch[\"input_ids\"])\n", 131 | "loss = output.loss" 132 | ] 133 | } 134 | ], 135 | "metadata": { 136 | "colab": { 137 | "name": "Data processing for Causal Language Modeling", 138 | "provenance": [] 139 | } 140 | }, 141 | "nbformat": 4, 142 | "nbformat_minor": 4 143 | } 144 | -------------------------------------------------------------------------------- /course/videos/load_custom_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 
8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!wget https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "from datasets import load_dataset\n", 70 | "\n", 71 | "local_csv_dataset = load_dataset(\"csv\", data_files=\"winequality-white.csv\", sep=\";\")\n", 72 | "local_csv_dataset[\"train\"]" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "# Load the dataset from the URL directly\n", 82 | "dataset_url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\"\n", 83 | "remote_csv_dataset = load_dataset(\"csv\", data_files=dataset_url, sep=\";\")\n", 84 | "remote_csv_dataset" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "dataset_url = \"https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt\"\n", 94 | "text_dataset = load_dataset(\"text\", data_files=dataset_url)\n", 95 | "text_dataset[\"train\"][:5]" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "dataset_url = \"https://raw.githubusercontent.com/hirupert/sede/main/data/sede/train.jsonl\"\n", 105 | "json_lines_dataset = load_dataset(\"json\", data_files=dataset_url)\n", 106 | "json_lines_dataset[\"train\"][:2]" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "dataset_url = \"https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json\"\n", 116 | "json_dataset = load_dataset(\"json\", data_files=dataset_url, field=\"data\")\n", 117 | "json_dataset" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "url = \"https://rajpurkar.github.io/SQuAD-explorer/dataset/\"\n", 127 | "data_files = {\"train\": f\"{url}train-v2.0.json\", \"validation\": f\"{url}dev-v2.0.json\"}\n", 128 | "json_dataset = load_dataset(\"json\", data_files=data_files, field=\"data\")\n", 129 | "json_dataset" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [] 138 | } 139 | ], 140 | "metadata": { 141 | "colab": { 142 | "name": "Loading a custom dataset", 
143 | "provenance": [] 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 4 148 | } 149 | -------------------------------------------------------------------------------- /course/videos/debug_training_tf.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from datasets import load_dataset, load_metric\n", 61 | "from transformers import (\n", 62 | " AutoTokenizer,\n", 63 | " TFAutoModelForSequenceClassification,\n", 64 | ")\n", 65 | "\n", 66 | "raw_datasets = load_dataset(\"glue\", \"mnli\")\n", 67 | "\n", 68 | "model_checkpoint = \"distilbert-base-uncased\"\n", 69 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n", 70 | "\n", 71 | "\n", 72 | "def preprocess_function(examples):\n", 73 | " return tokenizer(examples[\"premise\"], examples[\"hypothesis\"], truncation=True)\n", 74 | "\n", 75 | "\n", 76 | "tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)\n", 77 | "model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "train_dataset = tokenized_datasets[\"train\"].to_tf_dataset(\n", 87 | " columns=[\"input_ids\", \"labels\"], batch_size=16, shuffle=True\n", 88 | ")\n", 89 | "\n", 90 | "validation_dataset = tokenized_datasets[\"validation_matched\"].to_tf_dataset(\n", 91 | " columns=[\"input_ids\", \"labels\"], batch_size=16, shuffle=True\n", 92 | ")\n", 93 | "\n", 94 | "model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint)\n", 95 | "\n", 96 | "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer='adam')\n", 97 | "\n", 98 | "model.fit(train_dataset)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "for batch in train_dataset:\n", 108 | " break" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "model.compile(optimizer='adam')" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "model = TFAutoModelForSequenceClassification.from_pretrained(\n", 127 | 
" model_checkpoint,\n", 128 | " num_labels=3\n", 129 | ")" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "model.compile(optimizer='adam')" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [] 147 | } 148 | ], 149 | "metadata": { 150 | "colab": { 151 | "name": "Debugging the Training Pipeline (TensorFlow)", 152 | "provenance": [] 153 | } 154 | }, 155 | "nbformat": 4, 156 | "nbformat_minor": 4 157 | } 158 | -------------------------------------------------------------------------------- /course/chapter8/section3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Asking for help on the forums" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Install the Transformers and Datasets libraries to run this notebook." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "!pip install datasets transformers[sentencepiece]" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from transformers import AutoTokenizer, AutoModel\n", 33 | "\n", 34 | "model_checkpoint = \"distilbert-base-uncased\"\n", 35 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n", 36 | "model = AutoModel.from_pretrained(model_checkpoint)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "text = \"\"\"\n", 46 | "Generation One is a retroactive term for the Transformers characters that\n", 47 | "appeared between 1984 and 1993. The Transformers began with the 1980s Japanese\n", 48 | "toy lines Micro Change and Diaclone. They presented robots able to transform\n", 49 | "into everyday vehicles, electronic items or weapons. Hasbro bought the Micro\n", 50 | "Change and Diaclone toys, and partnered with Takara. Marvel Comics was hired by\n", 51 | "Hasbro to create the backstory; editor-in-chief Jim Shooter wrote an overall\n", 52 | "story, and gave the task of creating the characthers to writer Dennis O'Neil.\n", 53 | "Unhappy with O'Neil's work (although O'Neil created the name \"Optimus Prime\"),\n", 54 | "Shooter chose Bob Budiansky to create the characters.\n", 55 | "\n", 56 | "The Transformers mecha were largely designed by Shōji Kawamori, the creator of\n", 57 | "the Japanese mecha anime franchise Macross (which was adapted into the Robotech\n", 58 | "franchise in North America). Kawamori came up with the idea of transforming\n", 59 | "mechs while working on the Diaclone and Macross franchises in the early 1980s\n", 60 | "(such as the VF-1 Valkyrie in Macross and Robotech), with his Diaclone mechs\n", 61 | "later providing the basis for Transformers.\n", 62 | "\n", 63 | "The primary concept of Generation One is that the heroic Optimus Prime, the\n", 64 | "villainous Megatron, and their finest soldiers crash land on pre-historic Earth\n", 65 | "in the Ark and the Nemesis before awakening in 1985, Cybertron hurtling through\n", 66 | "the Neutral zone as an effect of the war. 
The Marvel comic was originally part\n", 67 | "of the main Marvel Universe, with appearances from Spider-Man and Nick Fury,\n", 68 | "plus some cameos, as well as a visit to the Savage Land.\n", 69 | "\n", 70 | "The Transformers TV series began around the same time. Produced by Sunbow\n", 71 | "Productions and Marvel Productions, later Hasbro Productions, from the start it\n", 72 | "contradicted Budiansky's backstories. The TV series shows the Autobots looking\n", 73 | "for new energy sources, and crash landing as the Decepticons attack. Marvel\n", 74 | "interpreted the Autobots as destroying a rogue asteroid approaching Cybertron.\n", 75 | "Shockwave is loyal to Megatron in the TV series, keeping Cybertron in a\n", 76 | "stalemate during his absence, but in the comic book he attempts to take command\n", 77 | "of the Decepticons. The TV series would also differ wildly from the origins\n", 78 | "Budiansky had created for the Dinobots, the Decepticon turned Autobot Jetfire\n", 79 | "(known as Skyfire on TV), the Constructicons (who combine to form\n", 80 | "Devastator),[19][20] and Omega Supreme. The Marvel comic establishes early on\n", 81 | "that Prime wields the Creation Matrix, which gives life to machines. In the\n", 82 | "second season, the two-part episode The Key to Vector Sigma introduced the\n", 83 | "ancient Vector Sigma computer, which served the same original purpose as the\n", 84 | "Creation Matrix (giving life to Transformers), and its guardian Alpha Trion.\n", 85 | "\"\"\"\n", 86 | "\n", 87 | "inputs = tokenizer(text, return_tensors=\"pt\")\n", 88 | "logits = model(**inputs).logits" 89 | ] 90 | } 91 | ], 92 | "metadata": { 93 | "colab": { 94 | "name": "Asking for help on the forums", 95 | "provenance": [] 96 | } 97 | }, 98 | "nbformat": 4, 99 | "nbformat_minor": 4 100 | } 101 | -------------------------------------------------------------------------------- /sagemaker/02_getting_started_tensorflow/scripts/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import sys 5 | 6 | import tensorflow as tf 7 | from datasets import load_dataset 8 | from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding, create_optimizer 9 | 10 | 11 | if __name__ == "__main__": 12 | 13 | parser = argparse.ArgumentParser() 14 | 15 | # Hyperparameters sent by the client are passed as command-line arguments to the script. 
16 | parser.add_argument("--epochs", type=int, default=3) 17 | parser.add_argument("--train_batch_size", type=int, default=16) 18 | parser.add_argument("--eval_batch_size", type=int, default=8) 19 | parser.add_argument("--model_id", type=str) 20 | parser.add_argument("--learning_rate", type=str, default=3e-5) 21 | 22 | # Data, model, and output directories 23 | parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) 24 | parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"]) 25 | parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"]) 26 | 27 | args, _ = parser.parse_known_args() 28 | 29 | # Set up logging 30 | logger = logging.getLogger(__name__) 31 | 32 | logging.basicConfig( 33 | level=logging.getLevelName("INFO"), 34 | handlers=[logging.StreamHandler(sys.stdout)], 35 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 36 | ) 37 | 38 | # Load tokenizer 39 | tokenizer = AutoTokenizer.from_pretrained(args.model_id) 40 | 41 | # Load DatasetDict 42 | dataset = load_dataset("imdb") 43 | 44 | # Preprocess train dataset 45 | def preprocess_function(examples): 46 | return tokenizer(examples["text"], truncation=True) 47 | 48 | encoded_dataset = dataset.map(preprocess_function, batched=True) 49 | 50 | # define tokenizer_columns 51 | # tokenizer_columns is the list of keys from the dataset that get passed to the TensorFlow model 52 | tokenizer_columns = ["attention_mask", "input_ids"] 53 | 54 | # convert to TF datasets 55 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf") 56 | encoded_dataset["train"] = encoded_dataset["train"].rename_column("label", "labels") 57 | tf_train_dataset = encoded_dataset["train"].to_tf_dataset( 58 | columns=tokenizer_columns, 59 | label_cols=["labels"], 60 | shuffle=True, 61 | batch_size=8, 62 | collate_fn=data_collator, 63 | ) 64 | encoded_dataset["test"] = encoded_dataset["test"].rename_column("label", "labels") 65 | tf_validation_dataset = encoded_dataset["test"].to_tf_dataset( 66 | columns=tokenizer_columns, 67 | label_cols=["labels"], 68 | shuffle=False, 69 | batch_size=8, 70 | collate_fn=data_collator, 71 | ) 72 | 73 | # Prepare model labels - useful in inference API 74 | labels = encoded_dataset["train"].features["labels"].names 75 | num_labels = len(labels) 76 | label2id, id2label = dict(), dict() 77 | for i, label in enumerate(labels): 78 | label2id[label] = str(i) 79 | id2label[str(i)] = label 80 | 81 | # download model from model hub 82 | model = TFAutoModelForSequenceClassification.from_pretrained( 83 | args.model_id, num_labels=num_labels, label2id=label2id, id2label=id2label 84 | ) 85 | 86 | # create Adam optimizer with learning rate scheduling 87 | batches_per_epoch = len(encoded_dataset["train"]) // args.train_batch_size 88 | total_train_steps = int(batches_per_epoch * args.epochs) 89 | 90 | optimizer, _ = create_optimizer(init_lr=args.learning_rate, num_warmup_steps=0, num_train_steps=total_train_steps) 91 | loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) 92 | 93 | # define metric and compile model 94 | metrics = [tf.keras.metrics.SparseCategoricalAccuracy()] 95 | model.compile(optimizer=optimizer, loss=loss, metrics=metrics) 96 | 97 | # Training 98 | logger.info("*** Train ***") 99 | train_results = model.fit( 100 | tf_train_dataset, 101 | epochs=args.epochs, 102 | validation_data=tf_validation_dataset, 103 | ) 104 | 105 | output_eval_file = os.path.join(args.output_data_dir, "train_results.txt") 106 
| 107 | with open(output_eval_file, "w") as writer: 108 | logger.info("***** Train results *****") 109 | logger.info(train_results) 110 | for key, value in train_results.history.items(): 111 | logger.info(" %s = %s", key, value) 112 | writer.write("%s = %s\n" % (key, value)) 113 | 114 | # Save result 115 | model.save_pretrained(args.model_dir) 116 | tokenizer.save_pretrained(args.model_dir) 117 | -------------------------------------------------------------------------------- /course/videos/save_load_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from datasets import load_dataset\n", 61 | "\n", 62 | "raw_datasets = load_dataset(\"allocine\")\n", 63 | "raw_datasets.cache_files" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "raw_datasets.save_to_disk(\"my-arrow-datasets\")" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "from datasets import load_from_disk\n", 82 | "\n", 83 | "arrow_datasets_reloaded = load_from_disk(\"my-arrow-datasets\")\n", 84 | "arrow_datasets_reloaded" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "for split, dataset in raw_datasets.items():\n", 94 | " dataset.to_csv(f\"my-dataset-{split}.csv\", index=None)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "data_files = {\n", 104 | " \"train\": \"my-dataset-train.csv\",\n", 105 | " \"validation\": \"my-dataset-validation.csv\",\n", 106 | " \"test\": \"my-dataset-test.csv\",\n", 107 | "}\n", 108 | "\n", 109 | "csv_datasets_reloaded = load_dataset(\"csv\", data_files=data_files)\n", 110 | "csv_datasets_reloaded" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "# Save in JSON Lines format\n", 120 | "for split, dataset in raw_datasets.items():\n", 121 | " dataset.to_json(f\"my-dataset-{split}.jsonl\")\n", 122 | "\n", 123 | "# Save in Parquet format\n", 124 | "for split, dataset in raw_datasets.items():\n", 125 | " 
dataset.to_parquet(f\"my-dataset-{split}.parquet\")" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "json_data_files = {\n", 135 | " \"train\": \"my-dataset-train.jsonl\",\n", 136 | " \"validation\": \"my-dataset-validation.jsonl\",\n", 137 | " \"test\": \"my-dataset-test.jsonl\",\n", 138 | "}\n", 139 | "\n", 140 | "parquet_data_files = {\n", 141 | " \"train\": \"my-dataset-train.parquet\",\n", 142 | " \"validation\": \"my-dataset-validation.parquet\",\n", 143 | " \"test\": \"my-dataset-test.parquet\",\n", 144 | "}\n", 145 | "\n", 146 | "# Reload with the `json` script\n", 147 | "json_datasets_reloaded = load_dataset(\"json\", data_files=json_data_files)\n", 148 | "# Reload with the `parquet` script\n", 149 | "parquet_datasets_reloaded = load_dataset(\"parquet\", data_files=parquet_data_files)" 150 | ] 151 | } 152 | ], 153 | "metadata": { 154 | "colab": { 155 | "name": "Saving and reloading a dataset", 156 | "provenance": [] 157 | } 158 | }, 159 | "nbformat": 4, 160 | "nbformat_minor": 4 161 | } 162 | -------------------------------------------------------------------------------- /sagemaker/15_training_compiler/scripts/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import random 5 | import sys 6 | 7 | import numpy as np 8 | import torch 9 | from datasets import load_from_disk, load_metric 10 | from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments 11 | from transformers.trainer_utils import get_last_checkpoint 12 | 13 | if __name__ == "__main__": 14 | 15 | parser = argparse.ArgumentParser() 16 | 17 | # hyperparameters sent by the client are passed as command-line arguments to the script. 
18 | parser.add_argument("--epochs", type=int, default=3) 19 | parser.add_argument("--train_batch_size", type=int, default=32) 20 | parser.add_argument("--eval_batch_size", type=int, default=64) 21 | parser.add_argument("--warmup_steps", type=int, default=500) 22 | parser.add_argument("--model_id", type=str) 23 | parser.add_argument("--learning_rate", type=str, default=5e-5) 24 | parser.add_argument("--fp16", type=bool, default=True) 25 | 26 | # Data, model, and output directories 27 | parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) 28 | parser.add_argument("--output_dir", type=str, default=os.environ["SM_MODEL_DIR"]) 29 | parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"]) 30 | parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) 31 | parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"]) 32 | 33 | args, _ = parser.parse_known_args() 34 | 35 | # is needed for Amazon SageMaker Training Compiler 36 | os.environ["GPU_NUM_DEVICES"] = args.n_gpus 37 | 38 | # Set up logging 39 | logger = logging.getLogger(__name__) 40 | 41 | logging.basicConfig( 42 | level=logging.getLevelName("INFO"), 43 | handlers=[logging.StreamHandler(sys.stdout)], 44 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 45 | ) 46 | 47 | # load datasets 48 | train_dataset = load_from_disk(args.training_dir) 49 | test_dataset = load_from_disk(args.test_dir) 50 | 51 | logger.info(f" loaded train_dataset length is: {len(train_dataset)}") 52 | logger.info(f" loaded test_dataset length is: {len(test_dataset)}") 53 | 54 | # define metrics and metrics function 55 | metric = load_metric("accuracy") 56 | 57 | def compute_metrics(eval_pred): 58 | predictions, labels = eval_pred 59 | predictions = np.argmax(predictions, axis=1) 60 | return metric.compute(predictions=predictions, references=labels) 61 | 62 | # Prepare model labels - useful in inference API 63 | labels = train_dataset.features["labels"].names 64 | num_labels = len(labels) 65 | label2id, id2label = dict(), dict() 66 | for i, label in enumerate(labels): 67 | label2id[label] = str(i) 68 | id2label[str(i)] = label 69 | 70 | # download model from model hub 71 | model = AutoModelForSequenceClassification.from_pretrained( 72 | args.model_id, num_labels=num_labels, label2id=label2id, id2label=id2label 73 | ) 74 | tokenizer = AutoTokenizer.from_pretrained(args.model_id) 75 | 76 | # define training args 77 | training_args = TrainingArguments( 78 | output_dir=args.output_dir, 79 | overwrite_output_dir=True if get_last_checkpoint(args.output_dir) is not None else False, 80 | num_train_epochs=args.epochs, 81 | per_device_train_batch_size=args.train_batch_size, 82 | per_device_eval_batch_size=args.eval_batch_size, 83 | warmup_steps=args.warmup_steps, 84 | fp16=args.fp16, 85 | evaluation_strategy="epoch", 86 | save_strategy="epoch", 87 | save_total_limit=2, 88 | logging_dir=f"{args.output_data_dir}/logs", 89 | learning_rate=float(args.learning_rate), 90 | load_best_model_at_end=True, 91 | metric_for_best_model="accuracy", 92 | disable_tqdm=True, 93 | ) 94 | 95 | # create Trainer instance 96 | trainer = Trainer( 97 | model=model, 98 | args=training_args, 99 | compute_metrics=compute_metrics, 100 | train_dataset=train_dataset, 101 | eval_dataset=test_dataset, 102 | tokenizer=tokenizer, 103 | ) 104 | 105 | # train model 106 | trainer.train() 107 | 108 | # evaluate model 109 | eval_result = trainer.evaluate(eval_dataset=test_dataset) 110 | 111 | # 
writes eval result to file which can be accessed later in s3 ouput 112 | with open(os.path.join(args.output_data_dir, "eval_results.txt"), "w") as writer: 113 | print(f"***** Eval results *****") 114 | for key, value in sorted(eval_result.items()): 115 | writer.write(f"{key} = {value}\n") 116 | print(f"{key} = {value}\n") 117 | 118 | # Saves the model to s3 uses os.environ["SM_MODEL_DIR"] to make sure checkpointing works 119 | trainer.save_model(os.environ["SM_MODEL_DIR"]) 120 | -------------------------------------------------------------------------------- /course/videos/mlm_processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from datasets import load_dataset\n", 61 | "\n", 62 | "raw_datasets = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\")\n", 63 | "raw_datasets[\"train\"]" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "from datasets import load_dataset\n", 73 | "from transformers import AutoTokenizer\n", 74 | "\n", 75 | "raw_datasets = load_dataset(\"imdb\")\n", 76 | "raw_datasets = raw_datasets.remove_columns(\"label\")\n", 77 | "\n", 78 | "model_checkpoint = \"distilbert-base-cased\"\n", 79 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n", 80 | "context_length = 128\n", 81 | "\n", 82 | "def tokenize_pad_and_truncate(texts):\n", 83 | " return tokenizer(texts[\"text\"], truncation=True, padding=\"max_length\", max_length=context_length)\n", 84 | "\n", 85 | "tokenized_datasets = raw_datasets.map(tokenize_pad_and_truncate, batched=True)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "def tokenize_and_chunk(texts):\n", 95 | " return tokenizer(\n", 96 | " texts[\"text\"], truncation=True, max_length=context_length,\n", 97 | " return_overflowing_tokens=True\n", 98 | " )\n", 99 | "\n", 100 | "tokenized_datasets = raw_datasets.map(\n", 101 | " tokenize_and_chunk, batched=True, remove_columns=[\"text\"]\n", 102 | ")\n", 103 | "\n", 104 | "len(raw_datasets[\"train\"]), len(tokenized_datasets[\"train\"])" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "def 
tokenize_and_chunk(texts):\n", 114 | " all_input_ids = []\n", 115 | " for input_ids in tokenizer(texts[\"text\"])[\"input_ids\"]:\n", 116 | " all_input_ids.extend(input_ids)\n", 117 | " all_input_ids.append(tokenizer.eos_token_id)\n", 118 | " \n", 119 | " chunks = []\n", 120 | " for idx in range(0, len(all_input_ids), context_length):\n", 121 | " chunks.append(all_input_ids[idx: idx + context_length])\n", 122 | " return {\"input_ids\": chunks}\n", 123 | "\n", 124 | "tokenized_datasets = raw_datasets.map(tokenize_and_chunk, batched=True, remove_columns=[\"text\"])\n", 125 | "\n", 126 | "len(raw_datasets[\"train\"]), len(tokenized_datasets[\"train\"])" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "from transformers import DataCollatorForLanguageModeling\n", 136 | "\n", 137 | "data_collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [] 146 | } 147 | ], 148 | "metadata": { 149 | "colab": { 150 | "name": "Data processing for Masked Language Modeling", 151 | "provenance": [] 152 | } 153 | }, 154 | "nbformat": 4, 155 | "nbformat_minor": 4 156 | } 157 | -------------------------------------------------------------------------------- /course/videos/building_tokenizer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! 
pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from datasets import load_dataset\n", 61 | "\n", 62 | "dataset = load_dataset(\"wikitext\", name=\"wikitext-2-raw-v1\", split=\"train\")\n", 63 | "\n", 64 | "\n", 65 | "def get_training_corpus():\n", 66 | " for i in range(0, len(dataset), 1000):\n", 67 | " yield dataset[i : i + 1000][\"text\"]" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers, processors, decoders" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "tokenizer = Tokenizer(models.WordPiece(unk_token=\"[UNK]\"))" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "tokenizer.normalizer = normalizers.Sequence(\n", 95 | " [\n", 96 | " normalizers.Replace(Regex(r\"[\\p{Other}&&[^\\n\\t\\r]]\"), \"\"),\n", 97 | " normalizers.Replace(Regex(r\"[\\s]\"), \" \"),\n", 98 | " normalizers.Lowercase(),\n", 99 | " normalizers.NFD(), normalizers.StripAccents()]\n", 100 | ")" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "special_tokens = [\"[UNK]\", \"[PAD]\", \"[CLS]\", \"[SEP]\", \"[MASK]\"]\n", 119 | "trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "cls_token_id = tokenizer.token_to_id(\"[CLS]\")\n", 138 | "sep_token_id = tokenizer.token_to_id(\"[SEP]\")\n", 139 | "tokenizer.post_processor = processors.TemplateProcessing(\n", 140 | " single=f\"[CLS]:0 $A:0 [SEP]:0\",\n", 141 | " pair=f\"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1\",\n", 142 | " special_tokens=[(\"[CLS]\", cls_token_id), (\"[SEP]\", sep_token_id)],\n", 143 | ")" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "tokenizer.decoder = decoders.WordPiece(prefix=\"##\")" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [] 161 | } 162 | ], 163 | "metadata": { 164 | "colab": { 165 | "name": "Building a new tokenizer", 166 | "provenance": [] 167 | } 168 | }, 169 | "nbformat": 4, 170 | "nbformat_minor": 4 171 | } 172 | -------------------------------------------------------------------------------- /course/videos/memory_mapping_streaming.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of 
the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from datasets import load_dataset\n", 61 | "\n", 62 | "data_files = \"https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst\"\n", 63 | "large_dataset = load_dataset(\"json\", data_files=data_files, split=\"train\")\n", 64 | "size_gb = large_dataset.dataset_size / (1024 ** 3)\n", 65 | "print(f\"Dataset size (cache file) : {size_gb:.2f} GB\")" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "import psutil\n", 75 | "\n", 76 | "# Process.memory_info is expressed in bytes, so convert to megabytes\n", 77 | "print(f\"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB\")" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "import timeit\n", 87 | "\n", 88 | "code_snippet = \"\"\"batch_size = 1000\n", 89 | "\n", 90 | "for idx in range(0, len(large_dataset), batch_size):\n", 91 | " _ = large_dataset[idx:idx + batch_size]\n", 92 | "\"\"\"\n", 93 | "\n", 94 | "time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())\n", 95 | "print(\n", 96 | " f\"Iterated over {len(large_dataset)} examples (about {size_gb:.1f} GB) in \"\n", 97 | " f\"{time:.1f}s, i.e. 
{size_gb/time:.3f} GB/s\"\n", 98 | ")" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "large_dataset_streamed = load_dataset(\n", 108 | " \"json\", data_files=data_files, split=\"train\", streaming=True)\n", 109 | "\n", 110 | "next(iter(large_dataset_streamed))" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "type(large_dataset_streamed)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "from transformers import AutoTokenizer\n", 129 | "\n", 130 | "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", 131 | "tokenized_dataset = large_dataset_streamed.map(lambda x: tokenizer(x[\"text\"]))\n", 132 | "next(iter(tokenized_dataset))" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "# Select the first 5 examples \n", 142 | "dataset_head = large_dataset_streamed.take(5)\n", 143 | "list(dataset_head)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# Skip the first 1,000 examples and include the rest in the training set\n", 153 | "train_dataset = large_dataset_streamed.skip(1000)\n", 154 | "# Take the first 1,000 examples for the validation set\n", 155 | "validation_dataset = large_dataset_streamed.take(1000)" 156 | ] 157 | } 158 | ], 159 | "metadata": { 160 | "colab": { 161 | "name": "Memory Mapping & streaming", 162 | "provenance": [] 163 | } 164 | }, 165 | "nbformat": 4, 166 | "nbformat_minor": 4 167 | } 168 | -------------------------------------------------------------------------------- /course/videos/train_new_tokenizer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! 
pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from transformers import BertTokenizerFast\n", 61 | "\n", 62 | "tokenizer = BertTokenizerFast.from_pretrained(\n", 63 | " 'huggingface-course/bert-base-uncased-tokenizer-without-normalizer'\n", 64 | ")" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "text = \"here is a sentence adapted to our tokenizer\"\n", 74 | "print(tokenizer.tokenize(text))" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "text = \"এই বাক্যটি আমাদের টোকেনাইজারের উপযুক্ত নয়\"\n", 84 | "print(tokenizer.tokenize(text))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "text = \"this tokenizer does not know àccënts and CAPITAL LETTERS\"\n", 94 | "print(tokenizer.tokenize(text))" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "text = \"the medical vocabulary is divided into many sub-token: paracetamol, phrayngitis\"\n", 104 | "print(tokenizer.tokenize(text))" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "from datasets import load_dataset\n", 114 | "\n", 115 | "raw_datasets = load_dataset(\"code_search_net\", \"python\")" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "def get_training_corpus():\n", 125 | " dataset = raw_datasets[\"train\"]\n", 126 | " for start_idx in range(0, len(dataset), 1000):\n", 127 | " samples = dataset[start_idx : start_idx + 1000]\n", 128 | " yield samples[\"whole_func_string\"]" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "from transformers import AutoTokenizer\n", 138 | "\n", 139 | "old_tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n", 140 | "training_corpus = get_training_corpus()\n", 141 | "new_tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 52000)\n", 142 | "new_tokenizer.save_pretrained(\"code-search-net-tokenizer\")" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "example = \"\"\"class LinearLayer():\n", 152 | " def __init__(self, input_size, output_size):\n", 153 | " self.weight = torch.randn(input_size, output_size)\n", 154 | " self.bias = torch.zeros(output_size)\n", 155 | "\n", 156 | " def __call__(self, x):\n", 157 | " return x @ self.weights + self.bias\n", 158 | " \"\"\"\n", 159 | "\n", 160 | "print(old_tokenizer.tokenize(example))\n", 161 | "print(new_tokenizer.tokenize(example))" 162 | ] 163 | } 164 | ], 165 | "metadata": { 166 | "colab": { 167 | "name": "Training a new tokenizer", 168 | "provenance": [] 169 | } 170 | }, 171 | "nbformat": 4, 172 | "nbformat_minor": 4 173 | } 174 | -------------------------------------------------------------------------------- /course/chapter5/section2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | 
"cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# What if my dataset isn't on the Hub?" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Install the Transformers and Datasets libraries to run this notebook." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "!pip install datasets transformers[sentencepiece]" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz\n", 33 | "!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "!gzip -dkv SQuAD_it-*.json.gz" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "from datasets import load_dataset\n", 52 | "\n", 53 | "squad_it_dataset = load_dataset(\"json\", data_files=\"SQuAD_it-train.json\", field=\"data\")" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/plain": [ 64 | "DatasetDict({\n", 65 | " train: Dataset({\n", 66 | " features: ['title', 'paragraphs'],\n", 67 | " num_rows: 442\n", 68 | " })\n", 69 | "})" 70 | ] 71 | }, 72 | "execution_count": null, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "squad_it_dataset" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "{\n", 90 | " \"title\": \"Terremoto del Sichuan del 2008\",\n", 91 | " \"paragraphs\": [\n", 92 | " {\n", 93 | " \"context\": \"Il terremoto del Sichuan del 2008 o il terremoto...\",\n", 94 | " \"qas\": [\n", 95 | " {\n", 96 | " \"answers\": [{\"answer_start\": 29, \"text\": \"2008\"}],\n", 97 | " \"id\": \"56cdca7862d2951400fa6826\",\n", 98 | " \"question\": \"In quale anno si è verificato il terremoto nel Sichuan?\",\n", 99 | " },\n", 100 | " ...\n", 101 | " ],\n", 102 | " },\n", 103 | " ...\n", 104 | " ],\n", 105 | "}" 106 | ] 107 | }, 108 | "execution_count": null, 109 | "metadata": {}, 110 | "output_type": "execute_result" 111 | } 112 | ], 113 | "source": [ 114 | "squad_it_dataset[\"train\"][0]" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "DatasetDict({\n", 126 | " train: Dataset({\n", 127 | " features: ['title', 'paragraphs'],\n", 128 | " num_rows: 442\n", 129 | " })\n", 130 | " test: Dataset({\n", 131 | " features: ['title', 'paragraphs'],\n", 132 | " num_rows: 48\n", 133 | " })\n", 134 | "})" 135 | ] 136 | }, 137 | "execution_count": null, 138 | "metadata": {}, 139 | "output_type": "execute_result" 140 | } 141 | ], 142 | "source": [ 143 | "data_files = {\"train\": \"SQuAD_it-train.json\", \"test\": \"SQuAD_it-test.json\"}\n", 144 | "squad_it_dataset = load_dataset(\"json\", data_files=data_files, field=\"data\")\n", 145 | "squad_it_dataset" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "data_files = 
{\"train\": \"SQuAD_it-train.json.gz\", \"test\": \"SQuAD_it-test.json.gz\"}\n", 155 | "squad_it_dataset = load_dataset(\"json\", data_files=data_files, field=\"data\")" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "url = \"https://github.com/crux82/squad-it/raw/master/\"\n", 165 | "data_files = {\n", 166 | " \"train\": url + \"SQuAD_it-train.json.gz\",\n", 167 | " \"test\": url + \"SQuAD_it-test.json.gz\",\n", 168 | "}\n", 169 | "squad_it_dataset = load_dataset(\"json\", data_files=data_files, field=\"data\")" 170 | ] 171 | } 172 | ], 173 | "metadata": { 174 | "colab": { 175 | "name": "What if my dataset isn't on the Hub?", 176 | "provenance": [] 177 | } 178 | }, 179 | "nbformat": 4, 180 | "nbformat_minor": 4 181 | } 182 | -------------------------------------------------------------------------------- /course/videos/slice_and_dice.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! 
pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from datasets import load_dataset\n", 61 | "\n", 62 | "squad = load_dataset(\"squad\", split=\"train\")\n", 63 | "squad[0]" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "squad_shuffled = squad.shuffle(seed=666)\n", 73 | "squad_shuffled[0]" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "dataset = squad.train_test_split(test_size=0.1)\n", 83 | "dataset" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "indices = [0, 10, 20, 40, 80]\n", 93 | "examples = squad.select(indices)\n", 94 | "examples" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "sample = squad.shuffle().select(range(5))\n", 104 | "sample" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "squad_filtered = squad.filter(lambda x : x[\"title\"].startswith(\"L\"))\n", 114 | "squad_filtered[0]" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "squad.rename_column(\"context\", \"passages\")" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "squad.remove_columns([\"id\", \"title\"])" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "squad" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "squad.flatten()" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "def lowercase_title(example):\n", 160 | " return {\"title\": example[\"title\"].lower()}\n", 161 | "\n", 162 | "squad_lowercase = squad.map(lowercase_title)\n", 163 | "# Peek at random sample\n", 164 | "squad_lowercase.shuffle(seed=42)[\"title\"][:5]" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "from transformers import AutoTokenizer\n", 174 | "\n", 175 | "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", 176 | "\n", 177 | "def tokenize_title(example):\n", 178 | " return tokenizer(example[\"title\"])\n", 179 | "\n", 180 | "squad.map(tokenize_title, batched=True, batch_size=500)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [] 189 | } 190 | ], 191 | "metadata": { 192 | "colab": { 193 | "name": "Slide and dice a dataset 🔪", 194 | "provenance": [] 195 | } 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 4 199 | } 200 | -------------------------------------------------------------------------------- /course/videos/custom_loss.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from transformers import AutoTokenizer, AutoModelForCausalLM\n", 61 | "from accelerate import Accelerator\n", 62 | "\n", 63 | "accelerator = Accelerator()\n", 64 | "tokenizer = AutoTokenizer.from_pretrained(\"huggingface-course/code-search-net-tokenizer\")\n", 65 | "model = AutoModelForCausalLM.from_pretrained(\"huggingface-course/codeparrot-ds\")\n", 66 | "\n", 67 | "keytoken_ids = []\n", 68 | "for keyword in [\n", 69 | " \"plt\",\n", 70 | " \"pd\",\n", 71 | " \"sk\",\n", 72 | " \"fit\",\n", 73 | " \"predict\",\n", 74 | " \" plt\",\n", 75 | " \" pd\",\n", 76 | " \" sk\",\n", 77 | " \" fit\",\n", 78 | " \" predict\",\n", 79 | "]:\n", 80 | " ids = tokenizer([keyword]).input_ids[0]\n", 81 | " keytoken_ids.append(ids[0])\n", 82 | "\n", 83 | "batch = tokenizer([\"import numpy as np\"], return_tensors=\"pt\")\n", 84 | "model = accelerator.prepare(model)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "from torch.nn import CrossEntropyLoss\n", 94 | "import torch\n", 95 | "\n", 96 | "\n", 97 | "def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):\n", 98 | " # Shift so that tokens < n predict n\n", 99 | " shift_labels = inputs[..., 1:].contiguous()\n", 100 | " shift_logits = logits[..., :-1, :].contiguous()\n", 101 | " # Calculate per-token loss\n", 102 | " loss_fct = CrossEntropyLoss(reduce=False)\n", 103 | " loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))\n", 104 | " # Resize and average loss per sample\n", 105 | " loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(axis=1)\n", 106 | " # Calculate and scale weighting\n", 107 | " weights = torch.stack([(inputs == kt).float() for kt in keytoken_ids]).sum(\n", 108 | " axis=[0, 2]\n", 109 | " )\n", 110 | " weights = alpha * (1.0 + weights)\n", 111 | " # Calculate weighted average\n", 112 | " weighted_loss = (loss_per_sample * weights).mean()\n", 113 | " return weighted_loss" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "logits = model(batch[\"input_ids\"]).logits\n", 123 | "loss = keytoken_weighted_loss(batch[\"input_ids\"], 
logits, keytoken_ids)\n", 124 | "accelerator.backward(loss)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "from transformers import Trainer\n", 134 | "\n", 135 | "class MyTrainer(Trainer):\n", 136 | " def compute_loss(self, model, inputs, return_outputs=False):\n", 137 | " input_ids = inputs.get(\"input_ids\")\n", 138 | " outputs = model(input_ids)\n", 139 | " loss = keytoken_weighted_loss(input_ids, outputs.logits, keytoken_ids)\n", 140 | "\n", 141 | " return (loss, outputs) if return_outputs else loss" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [] 150 | } 151 | ], 152 | "metadata": { 153 | "colab": { 154 | "name": "Using a custom loss function", 155 | "provenance": [] 156 | } 157 | }, 158 | "nbformat": 4, 159 | "nbformat_minor": 4 160 | } 161 | -------------------------------------------------------------------------------- /course/chapter3/section3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Fine-tuning a model with the Trainer API" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Install the Transformers and Datasets libraries to run this notebook." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "!pip install datasets transformers[sentencepiece]" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from datasets import load_dataset\n", 33 | "from transformers import AutoTokenizer, DataCollatorWithPadding\n", 34 | "\n", 35 | "raw_datasets = load_dataset(\"glue\", \"mrpc\")\n", 36 | "checkpoint = \"bert-base-uncased\"\n", 37 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", 38 | "\n", 39 | "\n", 40 | "def tokenize_function(example):\n", 41 | " return tokenizer(example[\"sentence1\"], example[\"sentence2\"], truncation=True)\n", 42 | "\n", 43 | "\n", 44 | "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n", 45 | "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "from transformers import TrainingArguments\n", 55 | "\n", 56 | "training_args = TrainingArguments(\"test-trainer\")" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "from transformers import AutoModelForSequenceClassification\n", 66 | "\n", 67 | "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "from transformers import Trainer\n", 77 | "\n", 78 | "trainer = Trainer(\n", 79 | " model,\n", 80 | " training_args,\n", 81 | " train_dataset=tokenized_datasets[\"train\"],\n", 82 | " eval_dataset=tokenized_datasets[\"validation\"],\n", 83 | " data_collator=data_collator,\n", 84 | " tokenizer=tokenizer,\n", 85 | ")" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | 
"outputs": [], 93 | "source": [ 94 | "trainer.train()" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "data": { 104 | "text/plain": [ 105 | "(408, 2) (408,)" 106 | ] 107 | }, 108 | "execution_count": null, 109 | "metadata": {}, 110 | "output_type": "execute_result" 111 | } 112 | ], 113 | "source": [ 114 | "predictions = trainer.predict(tokenized_datasets[\"validation\"])\n", 115 | "print(predictions.predictions.shape, predictions.label_ids.shape)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "import numpy as np\n", 125 | "\n", 126 | "preds = np.argmax(predictions.predictions, axis=-1)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/plain": [ 137 | "{'accuracy': 0.8578431372549019, 'f1': 0.8996539792387542}" 138 | ] 139 | }, 140 | "execution_count": null, 141 | "metadata": {}, 142 | "output_type": "execute_result" 143 | } 144 | ], 145 | "source": [ 146 | "from datasets import load_metric\n", 147 | "\n", 148 | "metric = load_metric(\"glue\", \"mrpc\")\n", 149 | "metric.compute(predictions=preds, references=predictions.label_ids)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "def compute_metrics(eval_preds):\n", 159 | " metric = load_metric(\"glue\", \"mrpc\")\n", 160 | " logits, labels = eval_preds\n", 161 | " predictions = np.argmax(logits, axis=-1)\n", 162 | " return metric.compute(predictions=predictions, references=labels)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "training_args = TrainingArguments(\"test-trainer\", evaluation_strategy=\"epoch\")\n", 172 | "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n", 173 | "\n", 174 | "trainer = Trainer(\n", 175 | " model,\n", 176 | " training_args,\n", 177 | " train_dataset=tokenized_datasets[\"train\"],\n", 178 | " eval_dataset=tokenized_datasets[\"validation\"],\n", 179 | " data_collator=data_collator,\n", 180 | " tokenizer=tokenizer,\n", 181 | " compute_metrics=compute_metrics,\n", 182 | ")" 183 | ] 184 | } 185 | ], 186 | "metadata": { 187 | "colab": { 188 | "name": "Fine-tuning a model with the Trainer API", 189 | "provenance": [] 190 | } 191 | }, 192 | "nbformat": 4, 193 | "nbformat_minor": 4 194 | } 195 | -------------------------------------------------------------------------------- /sagemaker/14_train_and_push_to_hub/scripts/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import random 5 | import sys 6 | 7 | import numpy as np 8 | import torch 9 | from datasets import load_from_disk, load_metric 10 | from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments 11 | from transformers.trainer_utils import get_last_checkpoint 12 | 13 | if __name__ == "__main__": 14 | 15 | parser = argparse.ArgumentParser() 16 | 17 | # hyperparameters sent by the client are passed as command-line arguments to the script. 
18 | parser.add_argument("--epochs", type=int, default=3) 19 | parser.add_argument("--train_batch_size", type=int, default=32) 20 | parser.add_argument("--eval_batch_size", type=int, default=64) 21 | parser.add_argument("--warmup_steps", type=int, default=500) 22 | parser.add_argument("--model_id", type=str) 23 | parser.add_argument("--learning_rate", type=str, default=5e-5) 24 | parser.add_argument("--fp16", type=bool, default=True) 25 | 26 | # Push to Hub Parameters 27 | parser.add_argument("--push_to_hub", type=bool, default=True) 28 | parser.add_argument("--hub_model_id", type=str, default=None) 29 | parser.add_argument("--hub_strategy", type=str, default=None) 30 | parser.add_argument("--hub_token", type=str, default=None) 31 | 32 | # Data, model, and output directories 33 | parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) 34 | parser.add_argument("--output_dir", type=str, default=os.environ["SM_MODEL_DIR"]) 35 | parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"]) 36 | parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) 37 | parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"]) 38 | 39 | args, _ = parser.parse_known_args() 40 | 41 | # make sure we have required parameters to push 42 | if args.push_to_hub: 43 | if args.hub_strategy is None: 44 | raise ValueError("--hub_strategy is required when pushing to Hub") 45 | if args.hub_token is None: 46 | raise ValueError("--hub_token is required when pushing to Hub") 47 | 48 | # sets hub id if not provided 49 | if args.hub_model_id is None: 50 | args.hub_model_id = args.model_id.replace("/", "--") 51 | 52 | # Set up logging 53 | logger = logging.getLogger(__name__) 54 | 55 | logging.basicConfig( 56 | level=logging.getLevelName("INFO"), 57 | handlers=[logging.StreamHandler(sys.stdout)], 58 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 59 | ) 60 | 61 | # load datasets 62 | train_dataset = load_from_disk(args.training_dir) 63 | test_dataset = load_from_disk(args.test_dir) 64 | 65 | logger.info(f" loaded train_dataset length is: {len(train_dataset)}") 66 | logger.info(f" loaded test_dataset length is: {len(test_dataset)}") 67 | 68 | # define metrics and metrics function 69 | metric = load_metric("accuracy") 70 | 71 | def compute_metrics(eval_pred): 72 | predictions, labels = eval_pred 73 | predictions = np.argmax(predictions, axis=1) 74 | return metric.compute(predictions=predictions, references=labels) 75 | 76 | # Prepare model labels - useful in inference API 77 | labels = train_dataset.features["labels"].names 78 | num_labels = len(labels) 79 | label2id, id2label = dict(), dict() 80 | for i, label in enumerate(labels): 81 | label2id[label] = str(i) 82 | id2label[str(i)] = label 83 | 84 | # download model from model hub 85 | model = AutoModelForSequenceClassification.from_pretrained( 86 | args.model_id, num_labels=num_labels, label2id=label2id, id2label=id2label 87 | ) 88 | tokenizer = AutoTokenizer.from_pretrained(args.model_id) 89 | 90 | # define training args 91 | training_args = TrainingArguments( 92 | output_dir=args.output_dir, 93 | overwrite_output_dir=True if get_last_checkpoint(args.output_dir) is not None else False, 94 | num_train_epochs=args.epochs, 95 | per_device_train_batch_size=args.train_batch_size, 96 | per_device_eval_batch_size=args.eval_batch_size, 97 | warmup_steps=args.warmup_steps, 98 | fp16=args.fp16, 99 | evaluation_strategy="epoch", 100 | save_strategy="epoch", 101 | 
save_total_limit=2, 102 | logging_dir=f"{args.output_data_dir}/logs", 103 | learning_rate=float(args.learning_rate), 104 | load_best_model_at_end=True, 105 | metric_for_best_model="accuracy", 106 | # push to hub parameters 107 | push_to_hub=args.push_to_hub, 108 | hub_strategy=args.hub_strategy, 109 | hub_model_id=args.hub_model_id, 110 | hub_token=args.hub_token, 111 | ) 112 | 113 | # create Trainer instance 114 | trainer = Trainer( 115 | model=model, 116 | args=training_args, 117 | compute_metrics=compute_metrics, 118 | train_dataset=train_dataset, 119 | eval_dataset=test_dataset, 120 | tokenizer=tokenizer, 121 | ) 122 | 123 | # train model 124 | trainer.train() 125 | 126 | # evaluate model 127 | eval_result = trainer.evaluate(eval_dataset=test_dataset) 128 | 129 | # save best model, metrics and create model card 130 | trainer.create_model_card(model_name=args.hub_model_id) 131 | trainer.push_to_hub() 132 | 133 | # Saves the model to s3 uses os.environ["SM_MODEL_DIR"] to make sure checkpointing works 134 | trainer.save_model(os.environ["SM_MODEL_DIR"]) 135 | -------------------------------------------------------------------------------- /course/videos/token_processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! 
pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from datasets import load_dataset\n", 61 | "\n", 62 | "raw_datasets = load_dataset(\"conll2003\")\n", 63 | "raw_datasets = raw_datasets.remove_columns([\"chunk_tags\", \"id\", \"pos_tags\"])\n", 64 | "raw_datasets = raw_datasets.rename_column(\"ner_tags\", \"labels\")\n", 65 | "raw_datasets = raw_datasets.rename_column(\"tokens\", \"words\")\n", 66 | "raw_datasets[\"train\"]" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "print(raw_datasets[\"train\"][0][\"words\"])\n", 76 | "print(raw_datasets[\"train\"][0][\"labels\"])" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "label_names = raw_datasets[\"train\"].features[\"labels\"].feature.names\n", 86 | "label_names" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "from transformers import AutoTokenizer\n", 96 | "\n", 97 | "model_checkpoint = \"bert-base-cased\"\n", 98 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n", 99 | "\n", 100 | "inputs = tokenizer(raw_datasets[\"train\"][0][\"words\"], is_split_into_words=True)\n", 101 | "inputs.tokens()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "def shift_label(label):\n", 111 | " # If the label is B-XXX we change it to I-XXX\n", 112 | " if label % 2 == 1:\n", 113 | " label += 1\n", 114 | " return label\n", 115 | "\n", 116 | "def align_labels_with_tokens(labels, word_ids):\n", 117 | " new_labels = []\n", 118 | " current_word = None\n", 119 | " for word_id in word_ids:\n", 120 | " if word_id is None:\n", 121 | " new_labels.append(-100)\n", 122 | " elif word_id != current_word:\n", 123 | " # Start of a new word!\n", 124 | " current_word = word_id\n", 125 | " new_labels.append(labels[word_id])\n", 126 | " else:\n", 127 | " new_labels.append(shift_label(labels[word_id]))\n", 128 | "\n", 129 | " return new_labels" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "def tokenize_and_align_labels(examples):\n", 139 | " tokenized_inputs = tokenizer(examples[\"words\"], truncation=True, is_split_into_words=True)\n", 140 | " new_labels = []\n", 141 | " for i, labels in enumerate(examples[\"labels\"]):\n", 142 | " word_ids = tokenized_inputs.word_ids(i)\n", 143 | " new_labels.append(align_labels_with_tokens(labels, word_ids))\n", 144 | "\n", 145 | " tokenized_inputs[\"labels\"] = new_labels\n", 146 | " return tokenized_inputs\n", 147 | "\n", 148 | "tokenized_datasets = raw_datasets.map(tokenize_and_align_labels, batched=True)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "from transformers import DataCollatorForTokenClassification\n", 158 | "\n", 159 | "data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [] 168 | } 169 | ], 170 | "metadata": { 171 | "colab": { 172 
| "name": "Data processing for Token Classification", 173 | "provenance": [] 174 | } 175 | }, 176 | "nbformat": 4, 177 | "nbformat_minor": 4 178 | } 179 | -------------------------------------------------------------------------------- /course/videos/translation_processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from datasets import load_dataset, load_metric\n", 61 | "\n", 62 | "raw_datasets = load_dataset(\"kde4\", lang1=\"en\", lang2=\"fr\")\n", 63 | "\n", 64 | "def extract_languages(examples):\n", 65 | " inputs = [ex[\"en\"] for ex in examples[\"translation\"]]\n", 66 | " targets = [ex[\"fr\"] for ex in examples[\"translation\"]]\n", 67 | " return {\"inputs\": inputs, \"targets\": targets}\n", 68 | "\n", 69 | "raw_datasets = raw_datasets.map(extract_languages, batched=True, remove_columns=[\"id\", \"translation\"])\n", 70 | "raw_datasets" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "print(raw_datasets[\"train\"][10])\n", 80 | "print(raw_datasets[\"train\"][11])\n", 81 | "print(raw_datasets[\"train\"][12])" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "from transformers import AutoTokenizer\n", 91 | "\n", 92 | "model_checkpoint = \"Helsinki-NLP/opus-mt-en-fr\"\n", 93 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n", 94 | "\n", 95 | "sample = raw_datasets[\"train\"][12]\n", 96 | "inputs = tokenizer(sample[\"inputs\"])\n", 97 | "targets = tokenizer(sample[\"targets\"])\n", 98 | "\n", 99 | "\n", 100 | "print(tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"]))\n", 101 | "print(tokenizer.convert_ids_to_tokens(targets[\"input_ids\"]))" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "from transformers import AutoTokenizer\n", 111 | "\n", 112 | "model_checkpoint = \"Helsinki-NLP/opus-mt-en-fr\"\n", 113 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n", 114 | "\n", 115 | "sample = raw_datasets[\"train\"][12]\n", 116 | "inputs = tokenizer(sample[\"inputs\"])\n", 117 | "with tokenizer.as_target_tokenizer():\n", 118 | " targets = 
tokenizer(sample[\"targets\"])\n", 119 | "\n", 120 | "\n", 121 | "print(tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"]))\n", 122 | "print(tokenizer.convert_ids_to_tokens(targets[\"input_ids\"]))" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "max_input_length = 128\n", 132 | "max_target_length = 128\n", 133 | "\n", 134 | "def preprocess_function(examples):\n", 135 | " model_inputs = tokenizer(examples[\"inputs\"], max_length=max_input_length, truncation=True)\n", 136 | "\n", 137 | " # Setup the tokenizer for targets\n", 138 | " with tokenizer.as_target_tokenizer():\n", 139 | " labels = tokenizer(examples[\"targets\"], max_length=max_target_length, truncation=True)\n", 140 | "\n", 141 | " model_inputs[\"labels\"] = labels[\"input_ids\"]\n", 142 | " return model_inputs\n", 143 | "\n", 144 | "tokenized_datasets = raw_datasets.map(\n", 145 | " preprocess_function, batched=True, remove_columns=[\"inputs\", \"targets\"]\n", 146 | ")" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "from transformers import DataCollatorForSeq2Seq\n", 156 | "\n", 157 | "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [] 166 | } 167 | ], 168 | "metadata": { 169 | "colab": { 170 | "name": "Data processing for Translation", 171 | "provenance": [] 172 | } 173 | }, 174 | "nbformat": 4, 175 | "nbformat_minor": 4 176 | } 177 | -------------------------------------------------------------------------------- /sagemaker/09_image_classification_vision_transformer/scripts/train.py: -------------------------------------------------------------------------------- 1 | from transformers import ViTForImageClassification, Trainer, TrainingArguments,default_data_collator,ViTFeatureExtractor 2 | from datasets import load_from_disk,load_metric 3 | import random 4 | import logging 5 | import sys 6 | import argparse 7 | import os 8 | import numpy as np 9 | import subprocess 10 | 11 | subprocess.run([ 12 | "git", 13 | "config", 14 | "--global", 15 | "user.email", 16 | "sagemaker@huggingface.co", 17 | ], check=True) 18 | subprocess.run([ 19 | "git", 20 | "config", 21 | "--global", 22 | "user.name", 23 | "sagemaker", 24 | ], check=True) 25 | 26 | 27 | if __name__ == "__main__": 28 | 29 | parser = argparse.ArgumentParser() 30 | 31 | # hyperparameters sent by the client are passed as command-line arguments to the script. 
32 | parser.add_argument("--model_name", type=str) 33 | parser.add_argument("--output_dir", type=str,default="/opt/ml/model") 34 | parser.add_argument("--extra_model_name", type=str,default="sagemaker") 35 | parser.add_argument("--dataset", type=str,default="cifar10") 36 | parser.add_argument("--task", type=str,default="image-classification") 37 | parser.add_argument("--use_auth_token", type=str, default="") 38 | 39 | parser.add_argument("--num_train_epochs", type=int, default=3) 40 | parser.add_argument("--per_device_train_batch_size", type=int, default=32) 41 | parser.add_argument("--per_device_eval_batch_size", type=int, default=64) 42 | parser.add_argument("--warmup_steps", type=int, default=500) 43 | parser.add_argument("--weight_decay", type=float, default=0.01) 44 | parser.add_argument("--learning_rate", type=str, default=2e-5) 45 | 46 | parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) 47 | parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"]) 48 | 49 | args, _ = parser.parse_known_args() 50 | 51 | # Set up logging 52 | logger = logging.getLogger(__name__) 53 | 54 | logging.basicConfig( 55 | level=logging.getLevelName("INFO"), 56 | handlers=[logging.StreamHandler(sys.stdout)], 57 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 58 | ) 59 | 60 | # load datasets 61 | train_dataset = load_from_disk(args.training_dir) 62 | test_dataset = load_from_disk(args.test_dir) 63 | num_classes = train_dataset.features["label"].num_classes 64 | 65 | 66 | logger.info(f" loaded train_dataset length is: {len(train_dataset)}") 67 | logger.info(f" loaded test_dataset length is: {len(test_dataset)}") 68 | 69 | metric_name = "accuracy" 70 | # compute metrics function for binary classification 71 | 72 | metric = load_metric(metric_name) 73 | 74 | def compute_metrics(eval_pred): 75 | predictions, labels = eval_pred 76 | predictions = np.argmax(predictions, axis=1) 77 | return metric.compute(predictions=predictions, references=labels) 78 | 79 | # download model from model hub 80 | model = ViTForImageClassification.from_pretrained(args.model_name,num_labels=num_classes) 81 | 82 | # change labels 83 | id2label = {key:train_dataset.features["label"].names[index] for index,key in enumerate(model.config.id2label.keys())} 84 | label2id = {train_dataset.features["label"].names[index]:value for index,value in enumerate(model.config.label2id.values())} 85 | model.config.id2label = id2label 86 | model.config.label2id = label2id 87 | 88 | 89 | # define training args 90 | training_args = TrainingArguments( 91 | output_dir=args.output_dir, 92 | num_train_epochs=args.num_train_epochs, 93 | per_device_train_batch_size=args.per_device_train_batch_size, 94 | per_device_eval_batch_size=args.per_device_eval_batch_size, 95 | warmup_steps=args.warmup_steps, 96 | weight_decay=args.weight_decay, 97 | evaluation_strategy="epoch", 98 | logging_dir=f"{args.output_dir}/logs", 99 | learning_rate=float(args.learning_rate), 100 | load_best_model_at_end=True, 101 | metric_for_best_model=metric_name, 102 | ) 103 | 104 | 105 | # create Trainer instance 106 | trainer = Trainer( 107 | model=model, 108 | args=training_args, 109 | compute_metrics=compute_metrics, 110 | train_dataset=train_dataset, 111 | eval_dataset=test_dataset, 112 | data_collator=default_data_collator, 113 | ) 114 | 115 | # train model 116 | trainer.train() 117 | 118 | # evaluate model 119 | eval_result = trainer.evaluate(eval_dataset=test_dataset) 120 | 121 | # writes eval result to file 
which can be accessed later in s3 ouput 122 | with open(os.path.join(args.output_dir, "eval_results.txt"), "w") as writer: 123 | print(f"***** Eval results *****") 124 | for key, value in sorted(eval_result.items()): 125 | writer.write(f"{key} = {value}\n") 126 | 127 | # Saves the model to s3 128 | trainer.save_model(args.output_dir) 129 | 130 | if args.use_auth_token != "": 131 | kwargs = { 132 | "finetuned_from": args.model_name.split("/")[1], 133 | "tags": "image-classification", 134 | "dataset": args.dataset, 135 | } 136 | repo_name = ( 137 | f"{args.model_name.split('/')[1]}-{args.task}" 138 | if args.extra_model_name == "" 139 | else f"{args.model_name.split('/')[1]}-{args.task}-{args.extra_model_name}" 140 | ) 141 | 142 | trainer.push_to_hub( 143 | repo_name=repo_name, 144 | use_auth_token=args.use_auth_token, 145 | **kwargs, 146 | ) 147 | -------------------------------------------------------------------------------- /course/videos/sentence_pairs_tf.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! 
pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from transformers import AutoTokenizer\n", 61 | "\n", 62 | "checkpoint = \"bert-base-uncased\"\n", 63 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", 64 | "sequences = [\n", 65 | " \"I've been waiting for a HuggingFace course my whole life.\",\n", 66 | " \"This course is amazing!\",\n", 67 | "]\n", 68 | "batch = tokenizer(sequences, padding=True, truncation=True, return_tensors=\"tf\")" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "{'input_ids': [101, 2026, 2171, 2003, 25353, 22144, 2378, 1012, 102, 1045, 2147, 2012, 17662, 2227, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}" 80 | ] 81 | }, 82 | "execution_count": null, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "from transformers import AutoTokenizer\n", 89 | "\n", 90 | "checkpoint = \"bert-base-uncased\"\n", 91 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", 92 | "tokenizer(\"My name is Sylvain.\", \"I work at Hugging Face.\")" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "{'input_ids': [[101, 2026, 2171, 2003, 25353, 22144, 2378, 1012, 102, 1045, 2147, 2012, 17662, 2227, 1012, 102], [101, 2183, 2000, 1996, 5988, 1012, 102, 2023, 3185, 2003, 2307, 1012, 102, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]}" 104 | ] 105 | }, 106 | "execution_count": null, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "from transformers import AutoTokenizer\n", 113 | "\n", 114 | "checkpoint = \"bert-base-uncased\"\n", 115 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", 116 | "tokenizer(\n", 117 | " [\"My name is Sylvain.\", \"Going to the cinema.\"],\n", 118 | " [\"I work at Hugging Face.\", \"This movie is great.\"],\n", 119 | " padding=True\n", 120 | ")" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "name": "stderr", 130 | "output_type": "stream", 131 | "text": [ 132 | "All model checkpoint layers were used when initializing TFBertForSequenceClassification.\n", 133 | "\n", 134 | "Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']\n", 135 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "from transformers import TFAutoModelForSequenceClassification, AutoTokenizer\n", 141 | "\n", 142 | "checkpoint = \"bert-base-uncased\"\n", 143 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", 144 | "batch = tokenizer(\n", 145 | " [\"My name is Sylvain.\", \"Going to the cinema.\"],\n", 146 | " [\"I work at Hugging Face.\", \"This movie is great.\"],\n", 147 | " padding=True,\n", 148 
| " return_tensors=\"tf\",\n", 149 | ")\n", 150 | "\n", 151 | "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)\n", 152 | "outputs = model(**batch)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [] 161 | } 162 | ], 163 | "metadata": { 164 | "colab": { 165 | "name": "Preprocessing sentence pairs (TensorFlow)", 166 | "provenance": [] 167 | } 168 | }, 169 | "nbformat": 4, 170 | "nbformat_minor": 4 171 | } 172 | -------------------------------------------------------------------------------- /course/videos/tensorflow_finetuning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": { 65 | "cellView": "form" 66 | }, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/html": [ 71 | "" 72 | ], 73 | "text/plain": [ 74 | "" 75 | ] 76 | }, 77 | "execution_count": null, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "#@title\n", 84 | "from IPython.display import HTML\n", 85 | "\n", 86 | "HTML('')" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "Install the Transformers and Datasets libraries to run this notebook." 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "! 
pip install datasets transformers[sentencepiece]" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "from datasets import load_dataset\n", 112 | "from transformers import AutoTokenizer\n", 113 | "import numpy as np\n", 114 | "\n", 115 | "raw_datasets = load_dataset(\"glue\", \"mrpc\")\n", 116 | "checkpoint = \"bert-base-uncased\"\n", 117 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", 118 | "\n", 119 | "def tokenize_dataset(dataset):\n", 120 | " encoded = tokenizer(\n", 121 | " dataset[\"sentence1\"],\n", 122 | " dataset[\"sentence2\"],\n", 123 | " max_length=128,\n", 124 | " truncation=True,\n", 125 | " )\n", 126 | " return encoded.data\n", 127 | "\n", 128 | "tokenized_datasets = raw_datasets.map(tokenize_dataset, batched=True)\n", 129 | "\n", 130 | "train_dataset = tokenized_datasets[\"train\"].to_tf_dataset(\n", 131 | " columns=[\"input_ids\", \"attention_mask\", \"token_type_ids\"],\n", 132 | " label_cols=[\"label\"],\n", 133 | " shuffle=True,\n", 134 | " batch_size=8)\n", 135 | "\n", 136 | "validation_dataset = tokenized_datasets[\"validation\"].to_tf_dataset(\n", 137 | " columns=[\"input_ids\", \"attention_mask\", \"token_type_ids\"],\n", 138 | " label_cols=[\"label\"],\n", 139 | " shuffle=True,\n", 140 | " batch_size=8)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "next(iter(train_dataset))[1]" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "import tensorflow as tf\n", 159 | "from transformers import TFAutoModelForSequenceClassification\n", 160 | "\n", 161 | "checkpoint = 'bert-base-cased'\n", 162 | "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n", 163 | "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n", 164 | "model.compile(optimizer='adam', loss=loss)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "model.fit(\n", 174 | " train_dataset,\n", 175 | " validation_data=validation_dataset,\n", 176 | " epochs=3\n", 177 | ")" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [] 186 | } 187 | ], 188 | "metadata": { 189 | "colab": { 190 | "name": "Fine-Tuning with TensorFlow", 191 | "provenance": [] 192 | } 193 | }, 194 | "nbformat": 4, 195 | "nbformat_minor": 4 196 | } 197 | -------------------------------------------------------------------------------- /course/chapter3/section3_tf.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Fine-tuning a model with Keras" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Install the Transformers and Datasets libraries to run this notebook." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "!pip install datasets transformers[sentencepiece]" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from datasets import load_dataset\n", 33 | "from transformers import AutoTokenizer, DataCollatorWithPadding\n", 34 | "import numpy as np\n", 35 | "\n", 36 | "raw_datasets = load_dataset(\"glue\", \"mrpc\")\n", 37 | "checkpoint = \"bert-base-uncased\"\n", 38 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", 39 | "\n", 40 | "\n", 41 | "def tokenize_function(example):\n", 42 | " return tokenizer(example[\"sentence1\"], example[\"sentence2\"], truncation=True)\n", 43 | "\n", 44 | "\n", 45 | "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n", 46 | "\n", 47 | "data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors=\"tf\")\n", 48 | "\n", 49 | "tf_train_dataset = tokenized_datasets[\"train\"].to_tf_dataset(\n", 50 | " columns=[\"attention_mask\", \"input_ids\", \"token_type_ids\"],\n", 51 | " label_cols=[\"labels\"],\n", 52 | " shuffle=True,\n", 53 | " collate_fn=data_collator,\n", 54 | " batch_size=8,\n", 55 | ")\n", 56 | "\n", 57 | "tf_validation_dataset = tokenized_datasets[\"validation\"].to_tf_dataset(\n", 58 | " columns=[\"attention_mask\", \"input_ids\", \"token_type_ids\"],\n", 59 | " label_cols=[\"labels\"],\n", 60 | " shuffle=False,\n", 61 | " collate_fn=data_collator,\n", 62 | " batch_size=8,\n", 63 | ")" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "from transformers import TFAutoModelForSequenceClassification\n", 73 | "\n", 74 | "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "from tensorflow.keras.losses import SparseCategoricalCrossentropy\n", 84 | "\n", 85 | "model.compile(\n", 86 | " optimizer=\"adam\",\n", 87 | " loss=SparseCategoricalCrossentropy(from_logits=True),\n", 88 | " metrics=[\"accuracy\"],\n", 89 | ")\n", 90 | "model.fit(\n", 91 | " tf_train_dataset,\n", 92 | " validation_data=tf_validation_dataset,\n", 93 | ")" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "from tensorflow.keras.optimizers.schedules import PolynomialDecay\n", 103 | "\n", 104 | "batch_size = 8\n", 105 | "num_epochs = 3\n", 106 | "# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied\n", 107 | "# by the total number of epochs\n", 108 | "num_train_steps = len(tf_train_dataset) * num_epochs\n", 109 | "lr_scheduler = PolynomialDecay(\n", 110 | " initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps\n", 111 | ")\n", 112 | "from tensorflow.keras.optimizers import Adam\n", 113 | "\n", 114 | "opt = Adam(learning_rate=lr_scheduler)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "import tensorflow as tf\n", 124 | "\n", 125 | "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n", 126 | "loss = 
tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n", 127 | "model.compile(optimizer=opt, loss=loss, metrics=[\"accuracy\"])" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "preds = model.predict(tf_validation_dataset)[\"logits\"]" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "(408, 2) (408,)" 157 | ] 158 | }, 159 | "execution_count": null, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "class_preds = np.argmax(preds, axis=1)\n", 166 | "print(preds.shape, class_preds.shape)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "data": { 176 | "text/plain": [ 177 | "{'accuracy': 0.8578431372549019, 'f1': 0.8996539792387542}" 178 | ] 179 | }, 180 | "execution_count": null, 181 | "metadata": {}, 182 | "output_type": "execute_result" 183 | } 184 | ], 185 | "source": [ 186 | "from datasets import load_metric\n", 187 | "\n", 188 | "metric = load_metric(\"glue\", \"mrpc\")\n", 189 | "metric.compute(predictions=class_preds, references=raw_datasets[\"validation\"][\"label\"])" 190 | ] 191 | } 192 | ], 193 | "metadata": { 194 | "colab": { 195 | "name": "Fine-tuning a model with Keras", 196 | "provenance": [] 197 | } 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 4 201 | } 202 | -------------------------------------------------------------------------------- /course/videos/semantic_search.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! 
pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "import torch\n", 61 | "from transformers import AutoTokenizer, AutoModel\n", 62 | "\n", 63 | "sentences = [\n", 64 | " \"I took my dog for a walk\",\n", 65 | " \"Today is going to rain\",\n", 66 | " \"I took my cat for a walk\",\n", 67 | "]\n", 68 | "\n", 69 | "model_ckpt = \"sentence-transformers/all-MiniLM-L6-v2\"\n", 70 | "tokenizer = AutoTokenizer.from_pretrained(model_ckpt)\n", 71 | "model = AutoModel.from_pretrained(model_ckpt)\n", 72 | "\n", 73 | "encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors=\"pt\")\n", 74 | "\n", 75 | "with torch.no_grad():\n", 76 | " model_output = model(**encoded_input)\n", 77 | " \n", 78 | " \n", 79 | "token_embeddings = model_output.last_hidden_state\n", 80 | "print(f\"Token embeddings shape: {token_embeddings.size()}\")" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "import torch.nn.functional as F\n", 90 | "\n", 91 | "\n", 92 | "def mean_pooling(model_output, attention_mask):\n", 93 | " token_embeddings = model_output.last_hidden_state\n", 94 | " input_mask_expanded = (\n", 95 | " attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()\n", 96 | " )\n", 97 | " return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(\n", 98 | " input_mask_expanded.sum(1), min=1e-9\n", 99 | " )\n", 100 | "\n", 101 | "\n", 102 | "sentence_embeddings = mean_pooling(model_output, encoded_input[\"attention_mask\"])\n", 103 | "# Normalize the embeddings\n", 104 | "sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)\n", 105 | "print(f\"Sentence embeddings shape: {sentence_embeddings.size()}\")" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "import numpy as np\n", 115 | "from sklearn.metrics.pairwise import cosine_similarity\n", 116 | "\n", 117 | "sentence_embeddings = sentence_embeddings.detach().numpy()\n", 118 | "\n", 119 | "scores = np.zeros((sentence_embeddings.shape[0], sentence_embeddings.shape[0]))\n", 120 | "\n", 121 | "for idx in range(sentence_embeddings.shape[0]):\n", 122 | " scores[idx, :] = cosine_similarity([sentence_embeddings[idx]], sentence_embeddings)[0]" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "from datasets import load_dataset\n", 132 | "\n", 133 | "squad = load_dataset(\"squad\", split=\"validation\").shuffle(seed=42).select(range(100))\n", 134 | "\n", 135 | "\n", 136 | "def get_embeddings(text_list):\n", 137 | " encoded_input = tokenizer(\n", 138 | " text_list, padding=True, truncation=True, return_tensors=\"pt\"\n", 139 | " )\n", 140 | " encoded_input = {k: v for k, v in encoded_input.items()}\n", 141 | " with torch.no_grad():\n", 142 | " model_output = model(**encoded_input)\n", 143 | " return mean_pooling(model_output, encoded_input[\"attention_mask\"])\n", 144 | "\n", 145 | "\n", 146 | "squad_with_embeddings = squad.map(\n", 147 | " lambda x: {\"embeddings\": get_embeddings(x[\"context\"]).cpu().numpy()[0]}\n", 148 | ")" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | 
"squad_with_embeddings.add_faiss_index(column=\"embeddings\")\n", 158 | "\n", 159 | "question = \"Who headlined the halftime show for Super Bowl 50?\"\n", 160 | "question_embedding = get_embeddings([question]).cpu().detach().numpy()\n", 161 | "\n", 162 | "scores, samples = squad_with_embeddings.get_nearest_examples(\n", 163 | " \"embeddings\", question_embedding, k=3\n", 164 | ")" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [] 173 | } 174 | ], 175 | "metadata": { 176 | "colab": { 177 | "name": "Text embeddings & semantic search", 178 | "provenance": [] 179 | } 180 | }, 181 | "nbformat": 4, 182 | "nbformat_minor": 4 183 | } 184 | -------------------------------------------------------------------------------- /course/chapter2/section6_pt.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Putting it all together (PyTorch)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Install the Transformers and Datasets libraries to run this notebook." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "!pip install datasets transformers[sentencepiece]" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from transformers import AutoTokenizer\n", 33 | "\n", 34 | "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n", 35 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", 36 | "\n", 37 | "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n", 38 | "\n", 39 | "model_inputs = tokenizer(sequence)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n", 49 | "\n", 50 | "model_inputs = tokenizer(sequence)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n", 60 | "\n", 61 | "model_inputs = tokenizer(sequences)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# Will pad the sequences up to the maximum sequence length\n", 71 | "model_inputs = tokenizer(sequences, padding=\"longest\")\n", 72 | "\n", 73 | "# Will pad the sequences up to the model max length\n", 74 | "# (512 for BERT or DistilBERT)\n", 75 | "model_inputs = tokenizer(sequences, padding=\"max_length\")\n", 76 | "\n", 77 | "# Will pad the sequences up to the specified max length\n", 78 | "model_inputs = tokenizer(sequences, padding=\"max_length\", max_length=8)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n", 88 | "\n", 89 | "# Will truncate the sequences that are longer than the model max length\n", 90 | "# (512 for BERT or DistilBERT)\n", 91 | "model_inputs = tokenizer(sequences, truncation=True)\n", 92 | "\n", 93 | "# Will truncate 
the sequences that are longer than the specified max length\n", 94 | "model_inputs = tokenizer(sequences, max_length=8, truncation=True)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n", 104 | "\n", 105 | "# Returns PyTorch tensors\n", 106 | "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"pt\")\n", 107 | "\n", 108 | "# Returns TensorFlow tensors\n", 109 | "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"tf\")\n", 110 | "\n", 111 | "# Returns NumPy arrays\n", 112 | "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"np\")" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]\n", 124 | "[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]" 125 | ] 126 | }, 127 | "execution_count": null, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n", 134 | "\n", 135 | "model_inputs = tokenizer(sequence)\n", 136 | "print(model_inputs[\"input_ids\"])\n", 137 | "\n", 138 | "tokens = tokenizer.tokenize(sequence)\n", 139 | "ids = tokenizer.convert_tokens_to_ids(tokens)\n", 140 | "print(ids)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "\"[CLS] i've been waiting for a huggingface course my whole life. 
[SEP]\"\n", 152 | "\"i've been waiting for a huggingface course my whole life.\"" 153 | ] 154 | }, 155 | "execution_count": null, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "print(tokenizer.decode(model_inputs[\"input_ids\"]))\n", 162 | "print(tokenizer.decode(ids))" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "import torch\n", 172 | "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n", 173 | "\n", 174 | "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n", 175 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", 176 | "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n", 177 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n", 178 | "\n", 179 | "tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors=\"pt\")\n", 180 | "output = model(**tokens)" 181 | ] 182 | } 183 | ], 184 | "metadata": { 185 | "colab": { 186 | "name": "Putting it all together (PyTorch)", 187 | "provenance": [] 188 | } 189 | }, 190 | "nbformat": 4, 191 | "nbformat_minor": 4 192 | } 193 | -------------------------------------------------------------------------------- /course/chapter2/section6_tf.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Putting it all together (TensorFlow)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Install the Transformers and Datasets libraries to run this notebook." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "!pip install datasets transformers[sentencepiece]" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from transformers import AutoTokenizer\n", 33 | "\n", 34 | "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n", 35 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", 36 | "\n", 37 | "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n", 38 | "\n", 39 | "model_inputs = tokenizer(sequence)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n", 49 | "\n", 50 | "model_inputs = tokenizer(sequence)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n", 60 | "\n", 61 | "model_inputs = tokenizer(sequences)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# Will pad the sequences up to the maximum sequence length\n", 71 | "model_inputs = tokenizer(sequences, padding=\"longest\")\n", 72 | "\n", 73 | "# Will pad the sequences up to the model max length\n", 74 | "# (512 for BERT or DistilBERT)\n", 75 | "model_inputs = tokenizer(sequences, padding=\"max_length\")\n", 76 | "\n", 77 | "# Will pad the sequences up to the specified max length\n", 78 | 
"model_inputs = tokenizer(sequences, padding=\"max_length\", max_length=8)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n", 88 | "\n", 89 | "# Will truncate the sequences that are longer than the model max length\n", 90 | "# (512 for BERT or DistilBERT)\n", 91 | "model_inputs = tokenizer(sequences, truncation=True)\n", 92 | "\n", 93 | "# Will truncate the sequences that are longer than the specified max length\n", 94 | "model_inputs = tokenizer(sequences, max_length=8, truncation=True)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n", 104 | "\n", 105 | "# Returns PyTorch tensors\n", 106 | "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"pt\")\n", 107 | "\n", 108 | "# Returns TensorFlow tensors\n", 109 | "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"tf\")\n", 110 | "\n", 111 | "# Returns NumPy arrays\n", 112 | "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"np\")" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]\n", 124 | "[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]" 125 | ] 126 | }, 127 | "execution_count": null, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n", 134 | "\n", 135 | "model_inputs = tokenizer(sequence)\n", 136 | "print(model_inputs[\"input_ids\"])\n", 137 | "\n", 138 | "tokens = tokenizer.tokenize(sequence)\n", 139 | "ids = tokenizer.convert_tokens_to_ids(tokens)\n", 140 | "print(ids)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "\"[CLS] i've been waiting for a huggingface course my whole life. 
[SEP]\"\n", 152 | "\"i've been waiting for a huggingface course my whole life.\"" 153 | ] 154 | }, 155 | "execution_count": null, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "print(tokenizer.decode(model_inputs[\"input_ids\"]))\n", 162 | "print(tokenizer.decode(ids))" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "import tensorflow as tf\n", 172 | "from transformers import AutoTokenizer, TFAutoModelForSequenceClassification\n", 173 | "\n", 174 | "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n", 175 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", 176 | "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)\n", 177 | "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n", 178 | "\n", 179 | "tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors=\"tf\")\n", 180 | "output = model(**tokens)" 181 | ] 182 | } 183 | ], 184 | "metadata": { 185 | "colab": { 186 | "name": "Putting it all together (TensorFlow)", 187 | "provenance": [] 188 | } 189 | }, 190 | "nbformat": 4, 191 | "nbformat_minor": 4 192 | } 193 | -------------------------------------------------------------------------------- /course/videos/token_pipeline_pt.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "cellView": "form" 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "" 21 | ], 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": null, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "#@title\n", 33 | "from IPython.display import HTML\n", 34 | "\n", 35 | "HTML('')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Install the Transformers and Datasets libraries to run this notebook." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "! 
pip install datasets transformers[sentencepiece]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from transformers import pipeline\n", 61 | "\n", 62 | "token_classifier = pipeline(\"token-classification\")\n", 63 | "token_classifier(\"My name is Sylvain and I work at Hugging Face in Brooklyn.\")" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "token_classifier = pipeline(\"token-classification\", aggregation_strategy=\"simple\")\n", 73 | "token_classifier(\"My name is Sylvain and I work at Hugging Face in Brooklyn.\")" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "from transformers import AutoTokenizer, AutoModelForTokenClassification\n", 83 | "\n", 84 | "model_checkpoint = \"\"\n", 85 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n", 86 | "model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)\n", 87 | "\n", 88 | "example = \"My name is Sylvain and I work at Hugging Face in Brooklyn.\"\n", 89 | "inputs = tokenizer(example, return_tensors=\"pt\")\n", 90 | "outputs = model(**inputs)\n", 91 | "\n", 92 | "print(inputs[\"input_ids\"].shape)\n", 93 | "print(outputs.logits.shape)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "import torch\n", 103 | "\n", 104 | "probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist()\n", 105 | "predictions = probabilities.argmax(dim=-1)[0].tolist()\n", 106 | "print(predictions)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "model.config.id2label" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "results = []\n", 125 | "inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)\n", 126 | "tokens = inputs_with_offsets.tokens()\n", 127 | "offsets = inputs_with_offsets[\"offset_mapping\"]\n", 128 | "\n", 129 | "for idx, pred in enumerate(predictions):\n", 130 | " label = model.config.id2label[pred]\n", 131 | " if label != \"O\":\n", 132 | " start, end = offsets[idx]\n", 133 | " results.append(\n", 134 | " {\"entity\": label, \"score\": probabilities[idx][pred],\n", 135 | " \"word\": tokens[idx], \"start\": start, \"end\": end}\n", 136 | " )\n", 137 | "\n", 138 | "print(results)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "import numpy as np\n", 148 | "\n", 149 | "label_map = model.config.id2label\n", 150 | "results = []\n", 151 | "idx = 0\n", 152 | "while idx < len(predictions):\n", 153 | " pred = predictions[idx]\n", 154 | " label = label_map[pred]\n", 155 | " if label != \"O\":\n", 156 | " # Remove the B- or I-\n", 157 | " label = label[2:]\n", 158 | " start, _ = offsets[idx]\n", 159 | "\n", 160 | " # Grab all the tokens labeled with I-label\n", 161 | " all_scores = []\n", 162 | " while idx < len(predictions) and label_map[predictions[idx]] == f\"I-{label}\":\n", 163 | " all_scores.append(probabilities[idx][pred])\n", 164 | " _, end = offsets[idx]\n", 165 | " idx += 1\n", 166 | "\n", 167 | " # The score is the 
mean of all the scores of the token in that grouped entity.\n", 168 | " score = np.mean(all_scores).item()\n", 169 | " word = example[start:end]\n", 170 | " results.append(\n", 171 | " {\"entity_group\": label, \"score\": score,\n", 172 | " \"word\": word, \"start\": start, \"end\": end}\n", 173 | " )\n", 174 | " idx += 1" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [] 183 | } 184 | ], 185 | "metadata": { 186 | "colab": { 187 | "name": "Inside the Token classification pipeline (PyTorch)", 188 | "provenance": [] 189 | } 190 | }, 191 | "nbformat": 4, 192 | "nbformat_minor": 4 193 | } 194 | --------------------------------------------------------------------------------