├── requirements.txt
├── chapters
│   ├── es
│   │   ├── _toctree.yml
│   │   └── chapter0
│   │       └── section1.mdx
│   └── en
│       ├── _toctree.yml
│       ├── chapter0
│       │   └── 1.mdx
│       ├── chapter1
│       │   ├── 1.mdx
│       │   ├── 2.mdx
│       │   ├── 5.mdx
│       │   ├── 6.mdx
│       │   ├── 7.mdx
│       │   ├── 8.mdx
│       │   ├── 9.mdx
│       │   └── 10.mdx
│       ├── chapter2
│       │   ├── 1.mdx
│       │   ├── 3.mdx
│       │   ├── 6.mdx
│       │   └── 7.mdx
│       ├── chapter3
│       │   ├── 1.mdx
│       │   ├── 3.mdx
│       │   ├── 3_tf.mdx
│       │   ├── 5.mdx
│       │   └── 6.mdx
│       ├── chapter4
│       │   ├── 1.mdx
│       │   ├── 2.mdx
│       │   ├── 4.mdx
│       │   ├── 5.mdx
│       │   └── 6.mdx
│       ├── chapter5
│       │   ├── 1.mdx
│       │   ├── 2.mdx
│       │   ├── 7.mdx
│       │   └── 8.mdx
│       ├── chapter6
│       │   ├── 1.mdx
│       │   ├── 4.mdx
│       │   └── 9.mdx
│       ├── chapter7
│       │   ├── 1.mdx
│       │   └── 8.mdx
│       ├── chapter8
│       │   ├── 1.mdx
│       │   ├── 5.mdx
│       │   ├── 6.mdx
│       │   └── 7.mdx
│       └── event
│           └── 1.mdx
├── Makefile
├── .github
│   ├── workflows
│   │   ├── build_documentation.yml
│   │   ├── build_pr_documentation.yml
│   │   ├── delete_doc_comment.yml
│   │   └── quality.yml
│   └── ISSUE_TEMPLATE
│       └── translations.md
├── upcoming_chapters
│   └── en
│       ├── chapter9.md
│       ├── chapter10.md
│       ├── chapter11.md
│       └── chapter12.md
├── utils
│   ├── carbon-config.json
│   ├── code_formatter.py
│   └── generate_notebooks.py
├── .gitignore
└── README.md
/requirements.txt:
--------------------------------------------------------------------------------
1 | nbformat>=5.1.3
2 | PyYAML>=5.4.1
3 | black
--------------------------------------------------------------------------------
/chapters/es/_toctree.yml:
--------------------------------------------------------------------------------
1 | - title: Setup
2 | sections:
3 | - Creación de un entorno de trabajo
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: quality style
2 |
3 | # Check code formatting
4 | quality:
5 | python utils/code_formatter.py --check_only
6 |
7 | # Format code samples automatically and check if there are any problems left that need manual fixing
8 | style:
9 | python utils/code_formatter.py
10 |
--------------------------------------------------------------------------------
/.github/workflows/delete_doc_comment.yml:
--------------------------------------------------------------------------------
1 | name: Delete dev documentation
2 |
3 | on:
4 | pull_request:
5 | types: [ closed ]
6 |
7 |
8 | jobs:
9 | delete:
10 | uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@main
11 | with:
12 | pr_number: ${{ github.event.number }}
13 | package: course
--------------------------------------------------------------------------------
/upcoming_chapters/en/chapter11.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Chapter 11: A custom training loop'
3 | description:
4 | 'But what about my own specific problems?'
5 | prev: /chapter10
6 | next: /chapter12
7 | type: chapter
8 | id: 11
9 | ---
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/.github/workflows/quality.yml:
--------------------------------------------------------------------------------
1 | name: Quality Check
2 |
3 | on:
4 | pull_request:
5 |
6 | jobs:
7 | quality:
8 | runs-on: ubuntu-latest
9 | steps:
10 | - uses: actions/checkout@v2
11 | - name: Set up Python 3.6
12 | uses: actions/setup-python@v2
13 | with:
14 | python-version: 3.6
15 | - name: Install Python dependencies
16 | run: pip install black
17 | - name: Run Quality check
18 | run: make quality
--------------------------------------------------------------------------------
/.github/workflows/build_documentation.yml:
--------------------------------------------------------------------------------
1 | name: Build documentation
2 |
3 | on:
4 | push:
5 | branches:
6 | - release
7 | - doc-builder*
8 |
9 | jobs:
10 | build:
11 | uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
12 | with:
13 | commit_sha: ${{ github.sha }}
14 | package: course
15 | path_to_docs: course/chapters/en
16 | additional_args: --not_python_module
17 | secrets:
18 | token: ${{ secrets.HUGGINGFACE_PUSH }}
--------------------------------------------------------------------------------
/chapters/en/chapter6/9.mdx:
--------------------------------------------------------------------------------
1 | # Tokenizers, check!
2 |
3 | Great job finishing this chapter!
4 |
5 | After this deep dive into tokenizers, you should:
6 |
7 | - Be able to train a new tokenizer using an old one as a template
8 | - Understand how to use offsets to map tokens' positions to their original span of text
9 | - Know the differences between BPE, WordPiece, and Unigram
10 | - Be able to mix and match the blocks provided by the 🤗 Tokenizers library to build your own tokenizer
11 | - Be able to use that tokenizer inside the 🤗 Transformers library
12 |
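13 | If you want a one-screen refresher on the offsets point above, here is a minimal sketch (the checkpoint and sentence are just illustrative) showing how a fast tokenizer maps each token back to its span in the original text:
14 | 
15 | ```python
16 | from transformers import AutoTokenizer
17 | 
18 | tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
19 | text = "Tokenizers are fun!"
20 | encoding = tokenizer(text, return_offsets_mapping=True)
21 | 
22 | # Each (start, end) pair points back into the original string
23 | for token, (start, end) in zip(encoding.tokens(), encoding["offset_mapping"]):
24 |     print(token, repr(text[start:end]))
25 | ```
26 | 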
--------------------------------------------------------------------------------
/upcoming_chapters/en/chapter12.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Chapter 12: Contribute to Transformers'
3 | description:
4 | 'Giving back'
5 | prev: /chapter11
6 | next: null
7 | type: chapter
8 | id: 12
9 | ---
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/chapters/en/chapter8/6.mdx:
--------------------------------------------------------------------------------
1 | # Part 2 completed!
2 |
3 | Congratulations, you've made it through the second part of the course! We're actively working on the third one, so subscribe to our [newsletter](https://huggingface.curated.co/) to make sure you don't miss its release.
4 |
5 | You should now be able to tackle a range of NLP tasks, and fine-tune or pretrain a model on them. Don't forget to share your results with the community on the [Model Hub](https://huggingface.co/models).
6 |
7 | We can't wait to see what you will build with the knowledge that you've gained!
8 |
--------------------------------------------------------------------------------
/upcoming_chapters/en/chapter10.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Chapter 10: Speeding up training'
3 | description:
4 | 'We need to go faster.'
5 | prev: /chapter9
6 | next: /chapter11
7 | type: chapter
8 | id: 10
9 | ---
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/upcoming_chapters/en/chapter9.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Chapter 9: Specialized architectures'
3 | description:
4 | 'Become an expert at transformer models.'
5 | prev: /chapter8
6 | next: /chapter10
7 | type: chapter
8 | id: 9
9 | ---
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
--------------------------------------------------------------------------------
/.github/workflows/build_pr_documentation.yml:
--------------------------------------------------------------------------------
1 | name: Build PR Documentation
2 |
3 | on:
4 | pull_request:
5 |
6 | concurrency:
7 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
8 | cancel-in-progress: true
9 |
10 | jobs:
11 | build:
12 | uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
13 | with:
14 | commit_sha: ${{ github.event.pull_request.head.sha }}
15 | pr_number: ${{ github.event.number }}
16 | package: course
17 | path_to_docs: course/chapters/en
18 | additional_args: --not_python_module
--------------------------------------------------------------------------------
/utils/carbon-config.json:
--------------------------------------------------------------------------------
1 | {"paddingVertical":"1px","paddingHorizontal":"1px","backgroundImage":null,"backgroundImageSelection":null,"backgroundMode":"color","backgroundColor":"rgba(255,255,255,1)","dropShadow":false,"dropShadowOffsetY":"20px","dropShadowBlurRadius":"68px","theme":"one-light","windowTheme":"none","language":"python","fontFamily":"Fira Code","fontSize":"14px","lineHeight":"152%","windowControls":false,"widthAdjustment":true,"lineNumbers":false,"firstLineNumber":1,"exportSize":"2x","watermark":false,"squaredImage":false,"hiddenCharacters":false,"name":"","width":680,"highlights":{"keyword":"rgba(139,92,246,1)","variable":"rgba(236,72,153,1)","number":"rgba(180,83,9,1)","string":"rgba(80,161,79,1)"}}
--------------------------------------------------------------------------------
/chapters/en/chapter4/5.mdx:
--------------------------------------------------------------------------------
1 | # Part 1 completed!
2 |
3 | This is the end of the first part of the course! Part 2 will be released on November 15th with a big community event; see more information [here](https://huggingface.co/blog/course-launch-event).
4 |
5 | You should now be able to fine-tune a pretrained model on a text classification problem (single or pairs of sentences) and upload the result to the Model Hub. To make sure you mastered this first section, you should do exactly that on a problem that interests you (and not necessarily in English if you speak another language)! You can find help in the [Hugging Face forums](https://discuss.huggingface.co/) and share your project in [this topic](https://discuss.huggingface.co/t/share-your-projects/6803) once you're finished.
6 |
7 | We can't wait to see what you will build with this!
8 |
--------------------------------------------------------------------------------
/chapters/en/chapter2/7.mdx:
--------------------------------------------------------------------------------
1 | # Basic usage completed!
2 |
3 | Great job following the course up to here! To recap, in this chapter you:
4 |
5 | - Learned the basic building blocks of a Transformer model.
6 | - Learned what makes up a tokenization pipeline.
7 | - Saw how to use a Transformer model in practice.
8 | - Learned how to leverage a tokenizer to convert text to tensors that are understandable by the model.
9 | - Set up a tokenizer and a model together to get from text to predictions.
10 | - Learned the limitations of input IDs, and learned about attention masks.
11 | - Played around with versatile and configurable tokenizer methods.
12 |
13 | From now on, you should be able to freely navigate the 🤗 Transformers docs: the vocabulary will sound familiar, and you've already seen the methods that you'll use the majority of the time.
14 |
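15 | As a final refresher, here is a minimal sketch of the full text-to-predictions path you built in this chapter (PyTorch shown; the TensorFlow version is analogous with `TFAutoModelForSequenceClassification` and `return_tensors="tf"`). The checkpoint is just an example of a model with a sequence classification head:
16 | 
17 | ```python
18 | import torch
19 | from transformers import AutoTokenizer, AutoModelForSequenceClassification
20 | 
21 | checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
22 | tokenizer = AutoTokenizer.from_pretrained(checkpoint)
23 | model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
24 | 
25 | # The tokenizer handles padding, truncation, and tensor conversion
26 | inputs = tokenizer(["I've been waiting for this course my whole life!"], padding=True, truncation=True, return_tensors="pt")
27 | 
28 | # The model turns input IDs and attention masks into logits, which we convert to probabilities
29 | outputs = model(**inputs)
30 | predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
31 | print(predictions)
32 | ```
33 | 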
--------------------------------------------------------------------------------
/chapters/en/chapter1/6.mdx:
--------------------------------------------------------------------------------
1 | # Decoder models
2 |
3 |
4 |
5 | Decoder models use only the decoder of a Transformer model. At each stage, for a given word the attention layers can only access the words positioned before it in the sentence. These models are often called *auto-regressive models*.
6 |
7 | The pretraining of decoder models usually revolves around predicting the next word in the sentence.
8 |
9 | These models are best suited for tasks involving text generation.
10 |
11 | Representatives of this family of models include:
12 |
13 | - [CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)
14 | - [GPT](https://huggingface.co/transformers/model_doc/gpt.html)
15 | - [GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)
16 | - [Transformer XL](https://huggingface.co/transformers/model_doc/transformerxl.html)
17 |
--------------------------------------------------------------------------------
/chapters/en/chapter3/5.mdx:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Fine-tuning, Check!
4 |
5 | That was fun! In the first two chapters you learned about models and tokenizers, and now you know how to fine-tune them for your own data. To recap, in this chapter you:
6 |
7 | {#if fw === 'pt'}
8 | * Learned about datasets in the [Hub](https://huggingface.co/datasets)
9 | * Learned how to load and preprocess datasets, including using dynamic padding and collators
10 | * Implemented your own fine-tuning and evaluation of a model
11 | * Implemented a lower-level training loop
12 | * Used 🤗 Accelerate to easily adapt your training loop so it works for multiple GPUs or TPUs
13 |
14 | {:else}
15 | * Learned about datasets in the [Hub](https://huggingface.co/datasets)
16 | * Learned how to load and preprocess datasets
17 | * Learned how to fine-tune and evaluate a model with Keras
18 | * Implemented a custom metric
19 |
20 | {/if}
21 |
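22 | {#if fw === 'pt'}
23 | 
24 | To keep the `Trainer` recipe fresh in your mind, here is a compressed sketch of the pieces assembled in this chapter. It assumes the `tokenized_datasets`, `data_collator`, and `tokenizer` objects from the earlier sections are already defined:
25 | 
26 | ```python
27 | from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
28 | 
29 | model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
30 | training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
31 | 
32 | trainer = Trainer(
33 |     model,
34 |     training_args,
35 |     train_dataset=tokenized_datasets["train"],
36 |     eval_dataset=tokenized_datasets["validation"],
37 |     data_collator=data_collator,
38 |     tokenizer=tokenizer,
39 | )
40 | trainer.train()
41 | ```
42 | 
43 | {/if}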
--------------------------------------------------------------------------------
/chapters/en/chapter5/7.mdx:
--------------------------------------------------------------------------------
1 | # 🤗 Datasets, check!
2 |
3 | Well, that was quite a tour through the 🤗 Datasets library -- congratulations on making it this far! With the knowledge that you've gained from this chapter, you should be able to:
4 |
5 | - Load datasets from anywhere, be it the Hugging Face Hub, your laptop, or a remote server at your company.
6 | - Wrangle your data using a mix of the `Dataset.map()` and `Dataset.filter()` functions.
7 | - Quickly switch between data formats like Pandas and NumPy using `Dataset.set_format()`.
8 | - Create your very own dataset and push it to the Hugging Face Hub.
9 | - Embed your documents using a Transformer model and build a semantic search engine using FAISS.
10 |
11 | In [Chapter 7](/course/chapter7), we'll put all of this to good use as we take a deep dive into the core NLP tasks that Transformer models are great for. Before jumping ahead, though, put your knowledge of 🤗 Datasets to the test with a quick quiz!
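12 | 
13 | If you'd like a quick reminder of the wrangling functions listed above before taking the quiz, here is a small sketch (using the IMDb dataset purely as an example; its `text` column is what we filter and map over):
14 | 
15 | ```python
16 | from datasets import load_dataset
17 | 
18 | dataset = load_dataset("imdb", split="train")
19 | 
20 | # Dataset.filter() keeps only the examples we care about
21 | dataset = dataset.filter(lambda x: len(x["text"]) > 100)
22 | 
23 | # Dataset.map() adds or transforms columns
24 | dataset = dataset.map(lambda x: {"num_chars": len(x["text"])})
25 | 
26 | # Dataset.set_format() lets us view slices as a pandas DataFrame
27 | dataset.set_format("pandas")
28 | print(dataset[:3])
29 | dataset.reset_format()
30 | ```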
--------------------------------------------------------------------------------
/chapters/en/chapter3/1.mdx:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Introduction
4 |
5 | In [Chapter 2](/course/chapter2) we explored how to use tokenizers and pretrained models to make predictions. But what if you want to fine-tune a pretrained model for your own dataset? That's the topic of this chapter! You will learn:
6 |
7 | {#if fw === 'pt'}
8 | * How to prepare a large dataset from the Hub
9 | * How to use the high-level `Trainer` API to fine-tune a model
10 | * How to use a custom training loop
11 | * How to leverage the 🤗 Accelerate library to easily run that custom training loop on any distributed setup
12 |
13 | {:else}
14 | * How to prepare a large dataset from the Hub
15 | * How to use Keras to fine-tune a model
16 | * How to use Keras to get predictions
17 | * How to use a custom metric
18 |
19 | {/if}
20 |
21 | In order to upload your trained checkpoints to the Hugging Face Hub, you will need a huggingface.co account: [create an account](https://huggingface.co/join)
--------------------------------------------------------------------------------
/chapters/en/chapter8/1.mdx:
--------------------------------------------------------------------------------
1 | # Introduction
2 |
3 | Now that you know how to tackle the most common NLP tasks with 🤗 Transformers, you should be able to get started on your own projects! In this chapter we will explore what to do when you hit a problem. You'll learn how to successfully debug your code or your training, and how to ask the community for help if you don't manage to solve the problem by yourself. And if you think you've found a bug in one of the Hugging Face libraries, we'll show you the best way to report it so that the issue is resolved as quickly as possible.
4 |
5 | More precisely, in this chapter you will learn:
6 |
7 | - The first thing to do when you get an error
8 | - How to ask for help on the [forums](https://discuss.huggingface.co/)
9 | - How to debug your training pipeline
10 | - How to write a good issue
11 |
12 | None of this is specifically related to 🤗 Transformers or the Hugging Face ecosystem, of course; the lessons from this chapter are applicable to most open source projects!
13 |
--------------------------------------------------------------------------------
/chapters/en/chapter5/1.mdx:
--------------------------------------------------------------------------------
1 | # Introduction
2 |
3 | In [Chapter 3](/course/chapter3) you got your first taste of the 🤗 Datasets library and saw that there were three main steps when it came to fine-tuning a model:
4 |
5 | 1. Load a dataset from the Hugging Face Hub.
6 | 2. Preprocess the data with `Dataset.map()`.
7 | 3. Load and compute metrics.
8 |
9 | But this is just scratching the surface of what 🤗 Datasets can do! In this chapter, we will take a deep dive into the library. Along the way, we'll find answers to the following questions:
10 |
11 | * What do you do when your dataset is not on the Hub?
12 | * How can you slice and dice a dataset? (And what if you _really_ need to use Pandas?)
13 | * What do you do when your dataset is huge and will melt your laptop's RAM?
14 | * What the heck are "memory mapping" and Apache Arrow?
15 | * How can you create your own dataset and push it to the Hub?
16 |
17 | The techniques you learn here will prepare you for the advanced tokenization and fine-tuning tasks in [Chapter 6](/course/chapter6) and [Chapter 7](/course/chapter7) -- so grab a coffee and let's get started!
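18 | 
19 | As a warm-up, here is a condensed sketch of those three steps as they appeared in [Chapter 3](/course/chapter3) (the MRPC dataset and BERT checkpoint are just the ones used there):
20 | 
21 | ```python
22 | from datasets import load_dataset, load_metric
23 | from transformers import AutoTokenizer
24 | 
25 | # 1. Load a dataset from the Hugging Face Hub
26 | raw_datasets = load_dataset("glue", "mrpc")
27 | 
28 | # 2. Preprocess the data with Dataset.map()
29 | tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
30 | tokenized_datasets = raw_datasets.map(
31 |     lambda example: tokenizer(example["sentence1"], example["sentence2"], truncation=True),
32 |     batched=True,
33 | )
34 | 
35 | # 3. Load and compute metrics (predictions and references come from your trained model)
36 | metric = load_metric("glue", "mrpc")
37 | ```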
--------------------------------------------------------------------------------
/chapters/en/chapter1/5.mdx:
--------------------------------------------------------------------------------
1 | # Encoder models
2 |
3 |
4 |
5 | Encoder models use only the encoder of a Transformer model. At each stage, the attention layers can access all the words in the initial sentence. These models are often characterized as having "bi-directional" attention, and are often called *auto-encoding models*.
6 |
7 | The pretraining of these models usually revolves around somehow corrupting a given sentence (for instance, by masking random words in it) and tasking the model with finding or reconstructing the initial sentence.
8 |
9 | Encoder models are best suited for tasks requiring an understanding of the full sentence, such as sentence classification, named entity recognition (and more generally word classification), and extractive question answering.
10 |
11 | Representatives of this family of models include:
12 |
13 | - [ALBERT](https://huggingface.co/transformers/model_doc/albert.html)
14 | - [BERT](https://huggingface.co/transformers/model_doc/bert.html)
15 | - [DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)
16 | - [ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)
17 | - [RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)
18 |
--------------------------------------------------------------------------------
/chapters/en/chapter1/7.mdx:
--------------------------------------------------------------------------------
1 | # Sequence-to-sequence models
2 |
3 |
4 |
5 | Encoder-decoder models (also called *sequence-to-sequence models*) use both parts of the Transformer architecture. At each stage, the attention layers of the encoder can access all the words in the initial sentence, whereas the attention layers of the decoder can only access the words positioned before a given word in the input.
6 |
7 | The pretraining of these models can be done using the objectives of encoder or decoder models, but usually involves something a bit more complex. For instance, [T5](https://huggingface.co/t5-base) is pretrained by replacing random spans of text (that can contain several words) with a single mask special word, and the objective is then to predict the text that this mask word replaces.
8 |
9 | Sequence-to-sequence models are best suited for tasks revolving around generating new sentences depending on a given input, such as summarization, translation, or generative question answering.
10 |
11 | Representatives of this family of models include:
12 |
13 | - [BART](https://huggingface.co/transformers/model_doc/bart.html)
14 | - [mBART](https://huggingface.co/transformers/model_doc/mbart.html)
15 | - [Marian](https://huggingface.co/transformers/model_doc/marian.html)
16 | - [T5](https://huggingface.co/transformers/model_doc/t5.html)
17 |
--------------------------------------------------------------------------------
/chapters/en/chapter1/9.mdx:
--------------------------------------------------------------------------------
1 | # Summary
2 |
3 | In this chapter, you saw how to approach different NLP tasks using the high-level `pipeline()` function from 🤗 Transformers. You also saw how to search for and use models in the Hub, as well as how to use the Inference API to test the models directly in your browser.
4 |
5 | We discussed how Transformer models work at a high level, and talked about the importance of transfer learning and fine-tuning. A key aspect is that you can use the full architecture or only the encoder or decoder, depending on what kind of task you aim to solve. The following table summarizes this:
6 |
7 | | Model | Examples | Tasks |
8 | |-----------------|--------------------------------------------|----------------------------------------------------------------------------------|
9 | | Encoder | ALBERT, BERT, DistilBERT, ELECTRA, RoBERTa | Sentence classification, named entity recognition, extractive question answering |
10 | | Decoder | CTRL, GPT, GPT-2, Transformer XL | Text generation |
11 | | Encoder-decoder | BART, T5, Marian, mBART | Summarization, translation, generative question answering |
12 |
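13 | To connect this table back to the `pipeline()` function, here is a small sketch with one pipeline per row (the checkpoints are just small illustrative choices):
14 | 
15 | ```python
16 | from transformers import pipeline
17 | 
18 | # Encoder-style model under the hood: sentence classification
19 | classifier = pipeline("sentiment-analysis")
20 | print(classifier("I've been waiting for a HuggingFace course my whole life."))
21 | 
22 | # Decoder-style model: text generation
23 | generator = pipeline("text-generation", model="gpt2")
24 | print(generator("In this course, we will teach you how to", max_length=30))
25 | 
26 | # Encoder-decoder model: translation
27 | translator = pipeline("translation_en_to_fr", model="t5-small")
28 | print(translator("This course is produced by Hugging Face."))
29 | ```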
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode
2 |
3 | # Logs
4 | logs
5 | *.log
6 | npm-debug.log*
7 | yarn-debug.log*
8 | yarn-error.log*
9 |
10 | # Runtime data
11 | pids
12 | *.pid
13 | *.seed
14 | *.pid.lock
15 |
16 | # Directory for instrumented libs generated by jscoverage/JSCover
17 | lib-cov
18 |
19 | # Coverage directory used by tools like istanbul
20 | coverage
21 |
22 | # nyc test coverage
23 | .nyc_output
24 |
25 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
26 | .grunt
27 |
28 | # Bower dependency directory (https://bower.io/)
29 | bower_components
30 |
31 | # node-waf configuration
32 | .lock-wscript
33 |
34 | # Compiled binary addons (http://nodejs.org/api/addons.html)
35 | build/Release
36 |
37 | # Dependency directories
38 | node_modules/
39 | jspm_packages/
40 |
41 | # Typescript v1 declaration files
42 | typings/
43 |
44 | # Optional npm cache directory
45 | .npm
46 |
47 | # Optional eslint cache
48 | .eslintcache
49 |
50 | # Optional REPL history
51 | .node_repl_history
52 |
53 | # Output of 'npm pack'
54 | *.tgz
55 |
56 | # dotenv environment variables file
57 | .env
58 |
59 | # gatsby files
60 | .cache/
61 | public
62 |
63 | # Mac files
64 | .DS_Store
65 |
66 | # Yarn
67 | yarn-error.log
68 | yarn.lock
69 | .pnp/
70 | .pnp.js
71 | # Yarn Integrity file
72 | .yarn-integrity
73 |
74 | # Sylvain notes folder
75 | notes
76 |
77 | # Ignore Colab notebooks
78 | nbs/
79 |
80 | # Byte-compiled
81 | __pycache__/
82 | .cache/
--------------------------------------------------------------------------------
/chapters/en/chapter7/8.mdx:
--------------------------------------------------------------------------------
1 | # Mastering NLP
2 |
3 | If you've made it this far in the course, congratulations -- you now have all the knowledge and tools you need to tackle (almost) any NLP task with 🤗 Transformers and the Hugging Face ecosystem!
4 |
5 | We have seen a lot of different data collators, so we made this little video to help you find which one to use for each task:
6 |
7 |
8 |
9 | After completing this lightning tour through the core NLP tasks, you should:
10 |
11 | * Know which architectures (encoder, decoder, or encoder-decoder) are best suited for each task
12 | * Understand the difference between pretraining and fine-tuning a language model
13 | * Know how to train Transformer models with either the `Trainer` API and the distributed training features of 🤗 Accelerate, or with TensorFlow and Keras, depending on which track you've been following
14 | * Understand the meaning and limitations of metrics like ROUGE and BLEU for text generation tasks
15 | * Know how to interact with your fine-tuned models, both on the Hub and using the `pipeline` from 🤗 Transformers
16 |
17 | Despite all this knowledge, there will come a time when you'll either encounter a difficult bug in your code or have a question about how to solve a particular NLP problem. Fortunately, the Hugging Face community is here to help you! In the final chapter of this part of the course, we'll explore how you can debug your Transformer models and ask for help effectively.
--------------------------------------------------------------------------------
/chapters/en/chapter6/1.mdx:
--------------------------------------------------------------------------------
1 | # Introduction
2 |
3 | In [Chapter 3](/course/chapter3), we looked at how to fine-tune a model on a given task. When we do that, we use the same tokenizer that the model was pretrained with -- but what do we do when we want to train a model from scratch? In these cases, using a tokenizer that was pretrained on a corpus from another domain or language is typically suboptimal. For example, a tokenizer that's trained on an English corpus will perform poorly on a corpus of Japanese texts because the use of spaces and punctuation is very different in the two languages.
4 |
5 | In this chapter, you will learn how to train a brand new tokenizer on a corpus of texts, so it can then be used to pretrain a language model. This will all be done with the help of the [🤗 Tokenizers](https://github.com/huggingface/tokenizers) library, which provides the "fast" tokenizers in the [🤗 Transformers](https://github.com/huggingface/transformers) library. We'll take a close look at the features that this library provides, and explore how the fast tokenizers differ from the "slow" versions.
6 |
7 | Topics we will cover include:
8 |
9 | * How to train a new tokenizer similar to the one used by a given checkpoint on a new corpus of texts
10 | * The special features of fast tokenizers
11 | * The differences between the three main subword tokenization algorithms used in NLP today
12 | * How to build a tokenizer from scratch with the 🤗 Tokenizers library and train it on some data
13 |
14 | The techniques introduced in this chapter will prepare you for the section in [Chapter 7](/course/chapter7/6) where we look at creating a language model for Python source code. Let's start by looking at what it means to "train" a tokenizer in the first place.
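15 | 
16 | As a taste of the first topic, here is a minimal sketch of training a new tokenizer from an existing one (the checkpoint, the tiny in-memory corpus, and the output directory are only placeholders -- in practice you would iterate over a real dataset):
17 | 
18 | ```python
19 | from transformers import AutoTokenizer
20 | 
21 | # Reuse the algorithm and special tokens of an existing fast tokenizer
22 | old_tokenizer = AutoTokenizer.from_pretrained("gpt2")
23 | 
24 | # A tiny illustrative corpus -- in practice, an iterator over batches of texts
25 | corpus = ["def add(a, b):", "    return a + b"]
26 | 
27 | # Learn a new vocabulary from the corpus while keeping the same pipeline
28 | new_tokenizer = old_tokenizer.train_new_from_iterator(corpus, vocab_size=52000)
29 | new_tokenizer.save_pretrained("my-new-tokenizer")
30 | ```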
--------------------------------------------------------------------------------
/chapters/en/chapter4/1.mdx:
--------------------------------------------------------------------------------
1 | # The Hugging Face Hub
2 |
3 | The [Hugging Face Hub](https://huggingface.co/) -- our main website -- is a central platform that enables anyone to discover, use, and contribute new state-of-the-art models and datasets. It hosts a wide variety of models, with more than 10,000 publicly available. We'll focus on the models in this chapter, and take a look at the datasets in Chapter 5.
4 |
5 | The models in the Hub are not limited to 🤗 Transformers or even NLP. There are models from [Flair](https://github.com/flairNLP/flair) and [AllenNLP](https://github.com/allenai/allennlp) for NLP, [Asteroid](https://github.com/asteroid-team/asteroid) and [pyannote](https://github.com/pyannote/pyannote-audio) for speech, and [timm](https://github.com/rwightman/pytorch-image-models) for vision, to name a few.
6 |
7 | Each of these models is hosted as a Git repository, which allows versioning and reproducibility. Sharing a model on the Hub means opening it up to the community and making it accessible to anyone looking to easily use it, in turn eliminating their need to train a model on their own and simplifying sharing and usage.
8 |
9 | Additionally, sharing a model on the Hub automatically deploys a hosted Inference API for that model. Anyone in the community is free to test it out directly on the model's page, with custom inputs and appropriate widgets.
10 |
11 | The best part is that sharing and using any public model on the Hub is completely free! [Paid plans](https://huggingface.co/pricing) also exist if you wish to share models privately.
12 |
13 | The video below shows how to navigate the Hub.
14 |
15 |
16 |
17 | Having a huggingface.co account is required to follow along with this part, as we'll be creating and managing repositories on the Hugging Face Hub: [create an account](https://huggingface.co/join)
--------------------------------------------------------------------------------
/chapters/en/chapter7/1.mdx:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Introduction
4 |
5 | In [Chapter 3](/course/chapter3), you saw how to fine-tune a model for text classification. In this chapter, we will tackle the following common NLP tasks:
6 |
7 | - Token classification
8 | - Masked language modeling (like BERT)
9 | - Summarization
10 | - Translation
11 | - Causal language modeling pretraining (like GPT-2)
12 | - Question answering
13 |
14 | {#if fw === 'pt'}
15 |
16 | To do this, you'll need to leverage everything you learned about the `Trainer` API and the 🤗 Accelerate library in [Chapter 3](/course/chapter3), the 🤗 Datasets library in [Chapter 5](/course/chapter5), and the 🤗 Tokenizers library in [Chapter 6](/course/chapter6). We'll also upload our results to the Model Hub, like we did in [Chapter 4](/course/chapter4), so this is really the chapter where everything comes together!
17 |
18 | Each section can be read independently and will show you how to train a model with the `Trainer` API or with your own training loop, using 🤗 Accelerate. Feel free to skip either part and focus on the one that interests you the most: the `Trainer` API is great for fine-tuning or training your model without worrying about what's going on behind the scenes, while the training loop with `Accelerate` will let you customize any part you want more easily.
19 |
20 | {:else}
21 |
22 | To do this, you'll need to leverage everything you learned about training models with the Keras API in [Chapter 3](/course/chapter3), the 🤗 Datasets library in [Chapter 5](/course/chapter5), and the 🤗 Tokenizers library in [Chapter 6](/course/chapter6). We'll also upload our results to the Model Hub, like we did in [Chapter 4](/course/chapter4), so this is really the chapter where everything comes together!
23 |
24 | Each section can be read independently.
25 |
26 | {/if}
27 |
28 |
29 |
30 |
31 | If you read the sections in sequence, you will notice that they have quite a bit of code and prose in common. The repetition is intentional, to allow you to dip in (or come back later) to any task that interests you and find a complete working example.
32 |
33 |
34 |
--------------------------------------------------------------------------------
/chapters/en/chapter1/2.mdx:
--------------------------------------------------------------------------------
1 | # Natural Language Processing
2 |
3 | Before jumping into Transformer models, let's do a quick overview of what natural language processing is and why we care about it.
4 |
5 | ## What is NLP?
6 |
7 | NLP is a field of linguistics and machine learning focused on understanding everything related to human language. The aim of NLP tasks is not only to understand single words individually, but to be able to understand the context of those words.
8 |
9 | The following is a list of common NLP tasks, with some examples of each:
10 |
11 | - **Classifying whole sentences**: Getting the sentiment of a review, detecting if an email is spam, determining if a sentence is grammatically correct or whether two sentences are logically related or not
12 | - **Classifying each word in a sentence**: Identifying the grammatical components of a sentence (noun, verb, adjective), or the named entities (person, location, organization)
13 | - **Generating text content**: Completing a prompt with auto-generated text, filling in the blanks in a text with masked words
14 | - **Extracting an answer from a text**: Given a question and a context, extracting the answer to the question based on the information provided in the context
15 | - **Generating a new sentence from an input text**: Translating a text into another language, summarizing a text
16 |
17 | NLP isn't limited to written text though. It also tackles complex challenges in speech recognition and computer vision, such as generating a transcript of an audio sample or a description of an image.
18 |
19 | ## Why is it challenging?
20 |
21 | Computers don't process information in the same way as humans. For example, when we read the sentence "I am hungry," we can easily understand its meaning. Similarly, given two sentences such as "I am hungry" and "I am sad," we're able to easily determine how similar they are. For machine learning (ML) models, such tasks are more difficult. The text needs to be processed in a way that enables the model to learn from it. And because language is complex, we need to think carefully about how this processing must be done. There has been a lot of research done on how to represent text, and we will look at some methods in the next chapter.
22 |
--------------------------------------------------------------------------------
/chapters/en/chapter1/8.mdx:
--------------------------------------------------------------------------------
1 | # Bias and limitations
2 |
3 |
9 |
10 | If your intent is to use a pretrained model or a fine-tuned version in production, please be aware that, while these models are powerful tools, they come with limitations. The biggest of these is that, to enable pretraining on large amounts of data, researchers often scrape all the content they can find, taking the best as well as the worst of what is available on the internet.
11 |
12 | To give a quick illustration, let's go back to the example of a `fill-mask` pipeline with the BERT model:
13 |
14 | ```python
15 | from transformers import pipeline
16 |
17 | unmasker = pipeline("fill-mask", model="bert-base-uncased")
18 | result = unmasker("This man works as a [MASK].")
19 | print([r["token_str"] for r in result])
20 |
21 | result = unmasker("This woman works as a [MASK].")
22 | print([r["token_str"] for r in result])
23 | ```
24 |
25 | ```python out
26 | ['lawyer', 'carpenter', 'doctor', 'waiter', 'mechanic']
27 | ['nurse', 'waitress', 'teacher', 'maid', 'prostitute']
28 | ```
29 |
30 | When asked to fill in the missing word in these two sentences, the model gives only one gender-free answer (waiter/waitress). The others are work occupations usually associated with one specific gender -- and yes, prostitute ended up in the top 5 possibilities the model associates with "woman" and "work." This happens even though BERT is one of the rare Transformer models not built by scraping data from all over the internet, but rather using apparently neutral data (it's trained on the [English Wikipedia](https://huggingface.co/datasets/wikipedia) and [BookCorpus](https://huggingface.co/datasets/bookcorpus) datasets).
31 |
32 | When you use these tools, you therefore need to keep in the back of your mind that the original model you are using could very easily generate sexist, racist, or homophobic content. Fine-tuning the model on your data won't make this intrinsic bias disappear.
33 |
--------------------------------------------------------------------------------
/chapters/en/chapter2/1.mdx:
--------------------------------------------------------------------------------
1 | # Introduction
2 |
3 | As you saw in [Chapter 1](/course/chapter1), Transformer models are usually very large. With millions to tens of *billions* of parameters, training and deploying these models is a complicated undertaking. Furthermore, with new models being released on a near-daily basis and each having its own implementation, trying them all out is no easy task.
4 |
5 | The 🤗 Transformers library was created to solve this problem. Its goal is to provide a single API through which any Transformer model can be loaded, trained, and saved. The library's main features are:
6 |
7 | - **Ease of use**: Downloading, loading, and using a state-of-the-art NLP model for inference can be done in just two lines of code.
8 | - **Flexibility**: At their core, all models are simple PyTorch `nn.Module` or TensorFlow `tf.keras.Model` classes and can be handled like any other models in their respective machine learning (ML) frameworks.
9 | - **Simplicity**: Hardly any abstractions are made across the library. The "All in one file" is a core concept: a model's forward pass is entirely defined in a single file, so that the code itself is understandable and hackable.
10 |
11 | This last feature makes 🤗 Transformers quite different from other ML libraries. The models are not built on modules
12 | that are shared across files; instead, each model has its own layers. In addition to making the models more approachable and understandable, this allows you to easily experiment on one model without affecting others.
13 |
14 | This chapter will begin with an end-to-end example where we use a model and a tokenizer together to replicate the `pipeline()` function introduced in [Chapter 1](/course/chapter1). Next, we'll discuss the model API: we'll dive into the model and configuration classes, and show you how to load a model and how it processes numerical inputs to output predictions.
15 |
16 | Then we'll look at the tokenizer API, which is the other main component of the `pipeline()` function. Tokenizers take care of the first and last processing steps, handling the conversion from text to numerical inputs for the neural network, and the conversion back to text when it is needed. Finally, we'll show you how to handle sending multiple sentences through a model in a prepared batch, then wrap it all up with a closer look at the high-level `tokenizer()` function.
17 |
18 |
19 | ⚠️ In order to benefit from all features available with the Model Hub and 🤗 Transformers, we recommend creating an account.
20 |
--------------------------------------------------------------------------------
/utils/code_formatter.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import black
3 | import os
4 | import re
5 | from pathlib import Path
6 |
7 | def blackify(filename, check_only=False):
8 | # Read the content of the file
9 | with open(filename, "r", encoding="utf-8") as f:
10 | content = f.read()
11 | lines = content.split("\n")
12 |
13 | # Split the content into code samples in py or python blocks.
14 | code_samples = []
15 | line_index = 0
16 | while line_index < len(lines):
17 | line = lines[line_index]
18 | if line.strip() in ["```py", "```python"]:
19 | line_index += 1
20 | start_index = line_index
21 | while line_index < len(lines) and lines[line_index].strip() != "```":
22 | line_index += 1
23 |
24 | code = "\n".join(lines[start_index: line_index])
25 | # Deal with ! instructions
26 | code = re.sub(r"^!", r"## !", code, flags=re.MULTILINE)
27 |
28 | code_samples.append({
29 | "start_index": start_index,
30 | "end_index": line_index - 1,
31 | "code": code
32 | })
33 | line_index += 1
34 | else:
35 | line_index += 1
36 |
37 | # Let's blackify the code! We put everything in one big text to go faster.
38 | delimiter = "\n\n### New cell ###\n"
39 | full_code = delimiter.join([sample["code"] for sample in code_samples])
40 | formatted_code = full_code.replace("\t", " ")
41 | formatted_code = black.format_str(formatted_code, mode=black.FileMode({black.TargetVersion.PY37}, line_length=90))
42 |
43 | # Black adds last new lines we don't want, so we strip individual code samples.
44 | cells = formatted_code.split(delimiter)
45 | cells = [cell.strip() for cell in cells]
46 | formatted_code = delimiter.join(cells)
47 |
48 | if check_only:
49 | return full_code == formatted_code
50 | elif full_code == formatted_code:
51 | # Nothing to do, all is good
52 | return
53 |
54 | formatted_code = re.sub(r"^## !", r"!", formatted_code, flags=re.MULTILINE)
55 | print(f"Formatting {filename}")
56 | # Re-build the content with formatted code
57 | new_lines = []
58 | start_index = 0
59 | for sample, code in zip(code_samples, formatted_code.split(delimiter)):
60 | new_lines.extend(lines[start_index:sample["start_index"]])
61 | new_lines.append(code)
62 | start_index = sample["end_index"] + 1
63 | new_lines.extend(lines[start_index:])
64 |
65 |
66 | with open(filename, "w", encoding="utf-8") as f:
67 | f.write("\n".join(new_lines))
68 |
69 |
70 | def format_all_files(check_only=False):
71 | failures = []
72 | for filename in Path("chapters").glob("**/*.mdx"):
73 | try:
74 | same = blackify(filename, check_only=check_only)
75 | if check_only and not same:
76 | failures.append(filename)
77 | except Exception:
78 | print(f"Failed to format {filename}.")
79 | raise
80 |
81 | if check_only and len(failures) > 0:
82 | raise ValueError(f"{len(failures)} files need to be formatted, run `make style`.")
83 |
84 |
85 | if __name__ == "__main__":
86 | parser = argparse.ArgumentParser()
87 | parser.add_argument("--check_only", action="store_true", help="Just check files are properly formatted.")
88 | args = parser.parse_args()
89 |
90 | format_all_files(check_only=args.check_only)
91 |
--------------------------------------------------------------------------------
/chapters/en/_toctree.yml:
--------------------------------------------------------------------------------
1 | - title: 0. Setup
2 | sections:
3 | - local: chapter0/1
4 | title: Introduction
5 |
6 | - title: 1. Transformer models
7 | sections:
8 | - local: chapter1/1
9 | title: Introduction
10 | - local: chapter1/2
11 | title: Natural Language Processing
12 | - local: chapter1/3
13 | title: Transformers, what can they do?
14 | - local: chapter1/4
15 | title: How do Transformers work?
16 | - local: chapter1/5
17 | title: Encoder models
18 | - local: chapter1/6
19 | title: Decoder models
20 | - local: chapter1/7
21 | title: Sequence-to-sequence models
22 | - local: chapter1/8
23 | title: Bias and limitations
24 | - local: chapter1/9
25 | title: Summary
26 | - local: chapter1/10
27 | title: End-of-chapter quiz
28 | quiz: 1
29 |
30 | - title: 2. Using 🤗 Transformers
31 | sections:
32 | - local: chapter2/1
33 | title: Introduction
34 | - local: chapter2/2
35 | title: Behind the pipeline
36 | - local: chapter2/3
37 | title: Models
38 | - local: chapter2/4
39 | title: Tokenizers
40 | - local: chapter2/5
41 | title: Handling multiple sequences
42 | - local: chapter2/6
43 | title: Putting it all together
44 | - local: chapter2/7
45 | title: Basic usage completed!
46 | - local: chapter2/8
47 | title: End-of-chapter quiz
48 | quiz: 2
49 |
50 | - title: 3. Fine-tuning a pretrained model
51 | sections:
52 | - local: chapter3/1
53 | title: Introduction
54 | - local: chapter3/2
55 | title: Processing the data
56 | - local: chapter3/3
57 | title: Fine-tuning a model with the Trainer API or Keras
58 | local_fw: { pt: chapter3/3, tf: chapter3/3_tf }
59 | - local: chapter3/4
60 | title: A full training
61 | - local: chapter3/5
62 | title: Fine-tuning, Check!
63 | - local: chapter3/6
64 | title: End-of-chapter quiz
65 | quiz: 3
66 |
67 | - title: 4. Sharing models and tokenizers
68 | sections:
69 | - local: chapter4/1
70 | title: The Hugging Face Hub
71 | - local: chapter4/2
72 | title: Using pretrained models
73 | - local: chapter4/3
74 | title: Sharing pretrained models
75 | - local: chapter4/4
76 | title: Building a model card
77 | - local: chapter4/5
78 | title: Part 1 completed!
79 | - local: chapter4/6
80 | title: End-of-chapter quiz
81 | quiz: 4
82 |
83 | - title: 5. The 🤗 Datasets library
84 | sections:
85 | - local: chapter5/1
86 | title: Introduction
87 | - local: chapter5/2
88 | title: What if my dataset isn't on the Hub?
89 | - local: chapter5/3
90 | title: Time to slice and dice
91 | - local: chapter5/4
92 | title: Big data? 🤗 Datasets to the rescue!
93 | - local: chapter5/5
94 | title: Creating your own dataset
95 | - local: chapter5/6
96 | title: Semantic search with FAISS
97 | - local: chapter5/7
98 | title: 🤗 Datasets, check!
99 | - local: chapter5/8
100 | title: End-of-chapter quiz
101 | quiz: 5
102 |
103 | - title: 6. The 🤗 Tokenizers library
104 | sections:
105 | - local: chapter6/1
106 | title: Introduction
107 | - local: chapter6/2
108 | title: Training a new tokenizer from an old one
109 | - local: chapter6/3
110 | title: Fast tokenizers' special powers
111 | - local: chapter6/3b
112 | title: Fast tokenizers in the QA pipeline
113 | - local: chapter6/4
114 | title: Normalization and pre-tokenization
115 | - local: chapter6/5
116 | title: Byte-Pair Encoding tokenization
117 | - local: chapter6/6
118 | title: WordPiece tokenization
119 | - local: chapter6/7
120 | title: Unigram tokenization
121 | - local: chapter6/8
122 | title: Building a tokenizer, block by block
123 | - local: chapter6/9
124 | title: Tokenizers, check!
125 | - local: chapter6/10
126 | title: End-of-chapter quiz
127 | quiz: 6
128 |
129 | - title: 7. Main NLP tasks
130 | sections:
131 | - local: chapter7/1
132 | title: Introduction
133 | - local: chapter7/2
134 | title: Token classification
135 | - local: chapter7/3
136 | title: Fine-tuning a masked language model
137 | - local: chapter7/4
138 | title: Translation
139 | - local: chapter7/5
140 | title: Summarization
141 | - local: chapter7/6
142 | title: Training a causal language model from scratch
143 | - local: chapter7/7
144 | title: Question answering
145 | - local: chapter7/8
146 | title: Mastering NLP
147 | - local: chapter7/9
148 | title: End-of-chapter quiz
149 | quiz: 7
150 |
151 | - title: 8. How to ask for help
152 | sections:
153 | - local: chapter8/1
154 | title: Introduction
155 | - local: chapter8/2
156 | title: What to do when you get an error
157 | - local: chapter8/3
158 | title: Asking for help on the forums
159 | - local: chapter8/4
160 | title: Debugging the training pipeline
161 | local_fw: { pt: chapter8/4, tf: chapter8/4_tf }
162 | - local: chapter8/5
163 | title: How to write a good issue
164 | - local: chapter8/6
165 | title: Part 2 completed!
166 | - local: chapter8/7
167 | title: End-of-chapter quiz
168 | quiz: 8
169 |
170 | - title: Hugging Face Course Event
171 | sections:
172 | - local: event/1
173 | title: Part 2 Release Event
174 |
--------------------------------------------------------------------------------
/chapters/en/chapter4/2.mdx:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Using pretrained models
4 |
5 | {#if fw === 'pt'}
6 |
7 |
13 |
14 | {:else}
15 |
16 |
22 |
23 | {/if}
24 |
25 | The Model Hub makes selecting the appropriate model simple, so that using it in any downstream library can be done in a few lines of code. Let's take a look at how to actually use one of these models, and how to contribute back to the community.
26 |
27 | Let's say we're looking for a French-based model that can perform mask filling.
28 |
29 |
30 |
31 |
32 |
33 | We select the `camembert-base` checkpoint to try it out. The identifier `camembert-base` is all we need to start using it! As you've seen in previous chapters, we can instantiate it using the `pipeline()` function:
34 |
35 | ```py
36 | from transformers import pipeline
37 |
38 | camembert_fill_mask = pipeline("fill-mask", model="camembert-base")
39 | results = camembert_fill_mask("Le camembert est :)")
40 | ```
41 |
42 | ```python out
43 | [
44 | {'sequence': 'Le camembert est délicieux :)', 'score': 0.49091005325317383, 'token': 7200, 'token_str': 'délicieux'},
45 | {'sequence': 'Le camembert est excellent :)', 'score': 0.1055697426199913, 'token': 2183, 'token_str': 'excellent'},
46 | {'sequence': 'Le camembert est succulent :)', 'score': 0.03453313186764717, 'token': 26202, 'token_str': 'succulent'},
47 | {'sequence': 'Le camembert est meilleur :)', 'score': 0.0330314114689827, 'token': 528, 'token_str': 'meilleur'},
48 | {'sequence': 'Le camembert est parfait :)', 'score': 0.03007650189101696, 'token': 1654, 'token_str': 'parfait'}
49 | ]
50 | ```
51 |
52 | As you can see, loading a model within a pipeline is extremely simple. The only thing you need to watch out for is that the chosen checkpoint is suitable for the task it's going to be used for. For example, here we are loading the `camembert-base` checkpoint in the `fill-mask` pipeline, which is completely fine. But if we were to load this checkpoint in the `text-classification` pipeline, the results would not make any sense because the head of `camembert-base` is not suitable for this task! We recommend using the task selector in the Hugging Face Hub interface in order to select the appropriate checkpoints:
53 |
54 |
55 |
56 |
57 |
58 | You can also instantiate the checkpoint using the model architecture directly:
59 |
60 | {#if fw === 'pt'}
61 | ```py
62 | from transformers import CamembertTokenizer, CamembertForMaskedLM
63 |
64 | tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
65 | model = CamembertForMaskedLM.from_pretrained("camembert-base")
66 | ```
67 |
68 | However, we recommend using the [`Auto*` classes](https://huggingface.co/transformers/model_doc/auto.html?highlight=auto#auto-classes) instead, as these are by design architecture-agnostic. While the previous code sample limits users to checkpoints loadable in the CamemBERT architecture, using the `Auto*` classes makes switching checkpoints simple:
69 |
70 | ```py
71 | from transformers import AutoTokenizer, AutoModelForMaskedLM
72 |
73 | tokenizer = AutoTokenizer.from_pretrained("camembert-base")
74 | model = AutoModelForMaskedLM.from_pretrained("camembert-base")
75 | ```
76 | {:else}
77 | ```py
78 | from transformers import CamembertTokenizer, TFCamembertForMaskedLM
79 |
80 | tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
81 | model = TFCamembertForMaskedLM.from_pretrained("camembert-base")
82 | ```
83 |
84 | However, we recommend using the [`TFAuto*` classes](https://huggingface.co/transformers/model_doc/auto.html?highlight=auto#auto-classes) instead, as these are by design architecture-agnostic. While the previous code sample limits users to checkpoints loadable in the CamemBERT architecture, using the `TFAuto*` classes makes switching checkpoints simple:
85 |
86 | ```py
87 | from transformers import AutoTokenizer, TFAutoModelForMaskedLM
88 |
89 | tokenizer = AutoTokenizer.from_pretrained("camembert-base")
90 | model = TFAutoModelForMaskedLM.from_pretrained("camembert-base")
91 | ```
92 | {/if}
93 |
94 |
95 | When using a pretrained model, make sure to check how it was trained, on which datasets, its limits, and its biases. All of this information should be indicated on its model card.
96 |
97 |
--------------------------------------------------------------------------------
/chapters/en/chapter1/1.mdx:
--------------------------------------------------------------------------------
1 | # Introduction
2 |
3 | ## Welcome to the 🤗 Course!
4 |
5 |
6 |
7 | This course will teach you about natural language processing (NLP) using libraries from the [Hugging Face](https://huggingface.co/) ecosystem — [🤗 Transformers](https://github.com/huggingface/transformers), [🤗 Datasets](https://github.com/huggingface/datasets), [🤗 Tokenizers](https://github.com/huggingface/tokenizers), and [🤗 Accelerate](https://github.com/huggingface/accelerate) — as well as the [Hugging Face Hub](https://huggingface.co/models). It's completely free and without ads.
8 |
9 |
10 | ## What to expect?
11 |
12 | Here is a brief overview of the course:
13 |
14 |
15 |
16 |
17 |
18 |
19 | - Chapters 1 to 4 provide an introduction to the main concepts of the 🤗 Transformers library. By the end of this part of the course, you will be familiar with how Transformer models work and will know how to use a model from the [Hugging Face Hub](https://huggingface.co/models), fine-tune it on a dataset, and share your results on the Hub!
20 | - Chapters 5 to 8 teach the basics of 🤗 Datasets and 🤗 Tokenizers before diving into classic NLP tasks. By the end of this part, you will be able to tackle the most common NLP problems by yourself.
21 | - Chapters 9 to 12 go beyond NLP, and explore how Transformer models can be used to tackle tasks in speech processing and computer vision. Along the way, you'll learn how to build and share demos of your models, and optimize them for production environments. By the end of this part, you will be ready to apply 🤗 Transformers to (almost) any machine learning problem!
22 |
23 | This course:
24 |
25 | * Requires a good knowledge of Python
26 | * Is better taken after an introductory deep learning course, such as [fast.ai's](https://www.fast.ai/) [Practical Deep Learning for Coders](https://course.fast.ai/) or one of the programs developed by [DeepLearning.AI](https://www.deeplearning.ai/)
27 | * Does not expect prior [PyTorch](https://pytorch.org/) or [TensorFlow](https://www.tensorflow.org/) knowledge, though some familiarity with either of those will help
28 |
29 | After you've completed this course, we recommend checking out DeepLearning.AI's [Natural Language Processing Specialization](https://www.coursera.org/specializations/natural-language-processing?utm_source=deeplearning-ai&utm_medium=institutions&utm_campaign=20211011-nlp-2-hugging_face-page-nlp-refresh), which covers a wide range of traditional NLP models like naive Bayes and LSTMs that are well worth knowing about!
30 |
31 | ## Who are we?
32 |
33 | About the authors:
34 |
35 | **Matthew Carrigan** is a Machine Learning Engineer at Hugging Face. He lives in Dublin, Ireland and previously worked as an ML engineer at Parse.ly and before that as a post-doctoral researcher at Trinity College Dublin. He does not believe we're going to get to AGI by scaling existing architectures, but has high hopes for robot immortality regardless.
36 |
37 | **Lysandre Debut** is a Machine Learning Engineer at Hugging Face and has been working on the 🤗 Transformers library since the very early development stages. His aim is to make NLP accessible for everyone by developing tools with a very simple API.
38 |
39 | **Sylvain Gugger** is a Research Engineer at Hugging Face and one of the core maintainers of the 🤗 Transformers library. Previously he was a Research Scientist at fast.ai, and he co-wrote _[Deep Learning for Coders with fastai and PyTorch](https://learning.oreilly.com/library/view/deep-learning-for/9781492045519/)_ with Jeremy Howard. The main focus of his research is on making deep learning more accessible, by designing and improving techniques that allow models to train fast on limited resources.
40 |
41 | **Merve Noyan** is a developer advocate at Hugging Face, working on developing tools and building content around them to democratize machine learning for everyone.
42 |
43 | **Lucile Saulnier** is a machine learning engineer at Hugging Face, developing and supporting the use of open source tools. She is also actively involved in many research projects in the field of Natural Language Processing such as collaborative training and BigScience.
44 |
45 | **Lewis Tunstall** is a machine learning engineer at Hugging Face, focused on developing open-source tools and making them accessible to the wider community. He is also a co-author of an upcoming [O’Reilly book on Transformers](https://www.oreilly.com/library/view/natural-language-processing/9781098103231/).
46 |
47 | **Leandro von Werra** is a machine learning engineer in the open-source team at Hugging Face and also a co-author of an upcoming [O’Reilly book on Transformers](https://www.oreilly.com/library/view/natural-language-processing/9781098103231/). He has several years of industry experience bringing NLP projects to production by working across the whole machine learning stack.
48 |
49 | Are you ready to roll? In this chapter, you will learn:
50 | * How to use the `pipeline()` function to solve NLP tasks such as text generation and classification
51 | * About the Transformer architecture
52 | * How to distinguish between encoder, decoder, and encoder-decoder architectures and use cases
53 |
--------------------------------------------------------------------------------
/chapters/en/chapter4/4.mdx:
--------------------------------------------------------------------------------
1 | # Building a model card
2 |
3 | The model card is a file which is arguably as important as the model and tokenizer files in a model repository. It is the central definition of the model, ensuring reusability by fellow community members and reproducibility of results, and providing a platform on which other members may build their artifacts.
4 |
5 | Documenting the training and evaluation process helps others understand what to expect of a model — and providing sufficient information regarding the data that was used and the preprocessing and postprocessing that were done ensures that the limitations, biases, and contexts in which the model is and is not useful can be identified and understood.
6 |
7 | Therefore, creating a model card that clearly defines your model is a very important step. Here, we provide some tips that will help you with this. Creating the model card is done through the *README.md* file you saw earlier, which is a Markdown file.
8 |
9 | The "model card" concept originates from a research direction from Google, first shared in the paper ["Model Cards for Model Reporting"](https://arxiv.org/abs/1810.03993) by Margaret Mitchell et al. A lot of information contained here is based on that paper, and we recommend you take a look at it to understand why model cards are so important in a world that values reproducibility, reusability, and fairness.
10 |
11 | The model card usually starts with a very brief, high-level overview of what the model is for, followed by additional details in the following sections:
12 |
13 | - Model description
14 | - Intended uses & limitations
15 | - How to use
16 | - Limitations and bias
17 | - Training data
18 | - Training procedure
19 | - Evaluation results
20 |
21 | Let's take a look at what each of these sections should contain.
22 |
23 | ### Model description
24 |
25 | The model description provides basic details about the model. This includes the architecture, the version, whether it was introduced in a paper, whether an original implementation is available, the author, and general information about the model. Any copyright should be attributed here. General information about training procedures, parameters, and important disclaimers can also be mentioned in this section.
26 |
27 | ### Intended uses & limitations
28 |
29 | Here you describe the use cases the model is intended for, including the languages, fields, and domains where it can be applied. This section of the model card can also document areas that are known to be out of scope for the model, or where it is likely to perform suboptimally.
30 |
31 | ### How to use
32 |
33 | This section should include some examples of how to use the model. This can showcase usage of the `pipeline()` function, usage of the model and tokenizer classes, and any other code you think might be helpful.
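
For instance, a minimal snippet for a text classification model could look like the following (the checkpoint name here is a placeholder -- swap in your own model's identifier on the Hub):

```py
from transformers import pipeline

# "my-username/my-finetuned-model" is a hypothetical model ID used for illustration
classifier = pipeline("text-classification", model="my-username/my-finetuned-model")
classifier("I loved the Hugging Face course!")
```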
34 |
35 | ### Training data
36 |
37 | This part should indicate which dataset(s) the model was trained on. A brief description of the dataset(s) is also welcome.
38 |
39 | ### Training procedure
40 |
41 | In this section you should describe all the relevant aspects of training that are useful from a reproducibility perspective. This includes any preprocessing and postprocessing that were done on the data, as well as details such as the number of epochs the model was trained for, the batch size, the learning rate, and so on.
42 |
43 | ### Variables and metrics
44 |
45 | Here you should describe the metrics you use for evaluation, and the different factors you are measuring. Mentioning which metric(s) were used, on which dataset, and on which dataset split makes it easy to compare your model's performance to that of other models. These should be informed by the previous sections, such as the intended users and use cases.
46 |
47 | ### Evaluation results
48 |
49 | Finally, provide an indication of how well the model performs on the evaluation dataset. If the model uses a decision threshold, either provide the decision threshold used in the evaluation, or provide details on evaluation at different thresholds for the intended uses.
50 |
51 | ## Example
52 |
53 | Check out the following for a few examples of well-crafted model cards:
54 |
55 | - [`bert-base-cased`](https://huggingface.co/bert-base-cased)
56 | - [`gpt2`](https://huggingface.co/gpt2)
57 | - [`distilbert`](https://huggingface.co/distilbert-base-uncased)
58 |
59 | More examples from different organizations and companies are available [here](https://github.com/huggingface/model_card/blob/master/examples.md).
60 |
61 | ## Note
62 |
63 | Model cards are not a requirement when publishing models, and you don't need to include all of the sections described above when you make one. However, explicit documentation of the model can only benefit future users, so we recommend that you fill in as many of the sections as possible to the best of your knowledge and ability.
64 |
65 | ## Model card metadata
66 |
67 | If you have done a little exploring of the Hugging Face Hub, you should have seen that some models belong to certain categories: you can filter them by tasks, languages, libraries, and more. The categories a model belongs to are identified according to the metadata you add in the model card header.
68 |
69 | For example, if you take a look at the [`camembert-base` model card](https://huggingface.co/camembert-base/blob/main/README.md), you should see the following lines in the model card header:
70 |
71 | ```
72 | ---
73 | language: fr
74 | license: mit
75 | datasets:
76 | - oscar
77 | ---
78 | ```
79 |
80 | This metadata is parsed by the Hugging Face Hub, which then identifies this model as being a French model, with an MIT license, trained on the OSCAR dataset.
81 |
82 | The [full model card specification](https://github.com/huggingface/hub-docs/blame/main/modelcard.md) allows specifying languages, licenses, tags, datasets, metrics, as well as the evaluation results the model obtained when training.
83 |
--------------------------------------------------------------------------------
/chapters/en/chapter0/1.mdx:
--------------------------------------------------------------------------------
1 | # Introduction
2 |
3 | Welcome to the Hugging Face course! This introduction will guide you through setting up a working environment. If you're just starting the course, we recommend you first take a look at [Chapter 1](/course/chapter1), then come back and set up your environment so you can try the code yourself.
4 |
5 | All the libraries that we'll be using in this course are available as Python packages, so here we'll show you how to set up a Python environment and install the specific libraries you'll need.
6 |
7 | We'll cover two ways of setting up your working environment, using a Colab notebook or a Python virtual environment. Feel free to choose the one that resonates with you the most. For beginners, we strongly recommend that you get started by using a Colab notebook.
8 |
9 | Note that we will not be covering the Windows system. If you're running on Windows, we recommend following along using a Colab notebook. If you're using a Linux distribution or macOS, you can use either approach described here.
10 |
11 | Most of the course relies on you having a Hugging Face account. We recommend creating one now: [create an account](https://huggingface.co/join).
12 |
13 | ## Using a Google Colab notebook
14 |
15 | Using a Colab notebook is the simplest possible setup; boot up a notebook in your browser and get straight to coding!
16 |
17 | If you're not familiar with Colab, we recommend you start by following the [introduction](https://colab.research.google.com/notebooks/intro.ipynb). Colab allows you to use some accelerating hardware, like GPUs or TPUs, and it is free for smaller workloads.
18 |
19 | Once you're comfortable moving around in Colab, create a new notebook and get started with the setup:
20 |
21 |
22 |

23 |
24 |
25 | The next step is to install the libraries that we'll be using in this course. We'll use `pip` for the installation, which is the package manager for Python. In notebooks, you can run system commands by preceding them with the `!` character, so you can install the 🤗 Transformers library as follows:
26 |
27 | ```
28 | !pip install transformers
29 | ```
30 |
31 | You can make sure the package was correctly installed by importing it within your Python runtime:
32 |
33 | ```
34 | import transformers
35 | ```
36 |
37 |
38 |

39 |
40 |
41 | This installs a very light version of 🤗 Transformers. In particular, no specific machine learning frameworks (like PyTorch or TensorFlow) are installed. Since we'll be using a lot of different features of the library, we recommend installing the development version, which comes with all the required dependencies for pretty much any imaginable use case:
42 |
43 | ```
44 | !pip install transformers[sentencepiece]
45 | ```
46 |
47 | This will take a bit of time, but then you'll be ready to go for the rest of the course!
48 |
49 | ## Using a Python virtual environment
50 |
51 | If you prefer to use a Python virtual environment, the first step is to install Python on your system. We recommend following [this guide](https://realpython.com/installing-python/) to get started.
52 |
53 | Once you have Python installed, you should be able to run Python commands in your terminal. You can start by running the following command to ensure that it is correctly installed before proceeding to the next steps: `python --version`. This should print out the Python version now available on your system.
54 |
55 | When running a Python command in your terminal, such as `python --version`, you should think of the program running your command as the "main" Python on your system. We recommend keeping this main installation free of any packages, and using it to create separate environments for each application you work on — this way, each application can have its own dependencies and packages, and you won't need to worry about potential compatibility issues with other applications.
56 |
57 | In Python this is done with [*virtual environments*](https://docs.python.org/3/tutorial/venv.html), which are self-contained directory trees that each contain a Python installation with a particular Python version alongside all the packages the application needs. Creating such a virtual environment can be done with a number of different tools, but we'll use the official Python package for that purpose, which is called [`venv`](https://docs.python.org/3/library/venv.html#module-venv).
58 |
59 | First, create the directory you'd like your application to live in — for example, you might want to make a new directory called *transformers-course* at the root of your home directory:
60 |
61 | ```
62 | mkdir ~/transformers-course
63 | cd ~/transformers-course
64 | ```
65 |
66 | From inside this directory, create a virtual environment using the Python `venv` module:
67 |
68 | ```
69 | python -m venv .env
70 | ```
71 |
72 | You should now have a directory called *.env* in your otherwise empty folder:
73 |
74 | ```
75 | ls -a
76 | ```
77 |
78 | ```out
79 | . .. .env
80 | ```
81 |
82 | You can jump in and out of your virtual environment with the `activate` script and the `deactivate` command:
83 |
84 | ```
85 | # Activate the virtual environment
86 | source .env/bin/activate
87 |
88 | # Deactivate the virtual environment
89 | deactivate
90 | ```
91 |
92 | You can make sure that the environment is activated by running the `which python` command: if it points to the virtual environment, then you have successfully activated it!
93 |
94 | ```
95 | which python
96 | ```
97 |
98 | ```out
99 | /home/<user>/transformers-course/.env/bin/python
100 | ```
101 |
102 | ### Installing dependencies
103 |
104 | As in the previous section on using Google Colab instances, you'll now need to install the packages required to continue. Again, you can install the development version of 🤗 Transformers using the `pip` package manager:
105 |
106 | ```
107 | pip install "transformers[sentencepiece]"
108 | ```
109 |
110 | You're now all set up and ready to go!
111 |
--------------------------------------------------------------------------------
/chapters/es/chapter0/section1.mdx:
--------------------------------------------------------------------------------
1 | Bienvenido al curso de Hugging Face. Esta introducción te guiará en la configuración de un entorno de trabajo. Si acabas de empezar el curso, te recomendamos que primero eches un vistazo al [Capítulo 1](/course/chapter1), y luego vuelvas y configures tu entorno para poder probar el código por ti mismo.
2 |
3 | Todas las bibliotecas que usaremos en este curso están disponibles como paquetes de Python, así que aquí te mostraremos cómo configurar un entorno de Python e instalar las bibliotecas específicas que necesitarás.
4 |
5 | Cubriremos dos formas de configurar tu entorno de trabajo, utilizando un cuaderno Colab o un entorno virtual Python. Siéntete libre de elegir la que más te convenga. Para los principiantes, recomendamos encarecidamente que comiencen utilizando un cuaderno Colab.
6 |
7 | Tenga en cuenta que no vamos a cubrir el sistema Windows. Si está utilizando Windows, le recomendamos que siga utilizando un cuaderno Colab. Si está utilizando una distribución de Linux o macOS, puede utilizar cualquiera de los enfoques descritos aquí.
8 |
9 | La mayor parte del curso depende de que tengas una cuenta de Hugging Face. Te recomendamos que crees una ahora: [crear una cuenta](https://huggingface.co/join).
10 |
11 | ## Uso de un cuaderno Google Colab
12 |
13 | Utilizar un cuaderno Colab es la configuración más sencilla posible; ¡arranca un cuaderno en tu navegador y ponte a codificar directamente!
14 |
15 | Si no estás familiarizado con Colab, te recomendamos que empieces siguiendo la [introducción](https://colab.research.google.com/notebooks/intro.ipynb). Colab te permite utilizar algún hardware de aceleración, como GPUs o TPUs, y es gratuito para cargas de trabajo pequeñas.
16 |
17 | Una vez que te sientas cómodo moviéndote en Colab, crea un nuevo notebook y comienza con la configuración:
18 |
19 |
20 |

21 |
22 |
23 | El siguiente paso es instalar las librerías que usaremos en este curso. Usaremos `pip` para la instalación, que es el gestor de paquetes para Python. En los cuadernos, puedes ejecutar comandos del sistema precediéndolos con el carácter `!`, así que puedes instalar la librería 🤗 Transformers de la siguiente manera:
24 |
25 | ```
26 | !pip install transformers
27 | ```
28 |
29 | Puede asegurarse de que el paquete se ha instalado correctamente importándolo en su tiempo de ejecución de Python:
30 |
31 | ```
32 | import transformers
33 | ```
34 |
35 |
36 |

37 |
38 |
39 | Esto instala una versión muy ligera de 🤗 Transformers. En particular, no se instalan frameworks específicos de deep learning (como PyTorch o TensorFlow). Dado que vamos a utilizar un montón de características diferentes de la biblioteca, se recomienda instalar la versión de desarrollo, que viene con todas las dependencias necesarias para casi cualquier caso de uso imaginable:
40 |
41 | ```
42 | !pip install transformers[sentencepiece]
43 | ```
44 |
45 | Esto te llevará un poco de tiempo, pero luego estarás listo para el resto del curso.
46 |
47 | ## Usar un entorno virtual de Python
48 |
49 | Si prefieres utilizar un entorno virtual de Python, el primer paso es instalar Python en tu sistema. Recomendamos seguir [esta guía](https://realpython.com/installing-python/) para empezar.
50 |
51 | Una vez que tengas Python instalado, deberías poder ejecutar comandos de Python en tu terminal. Puedes empezar ejecutando el siguiente comando para asegurarte de que está correctamente instalado antes de proceder a los siguientes pasos: `python --version`. Esto debería imprimir la versión de Python disponible en tu sistema.
52 |
53 | Cuando ejecutes un comando de Python en tu terminal, como `python --version`, debes pensar en el programa que ejecuta tu comando como el Python "principal" de tu sistema. Recomendamos mantener esta instalación principal libre de paquetes, y usarla para crear entornos separados para cada aplicación en la que trabajes - de esta manera, cada aplicación puede tener sus propias dependencias y paquetes, y no tendrás que preocuparte por posibles problemas de compatibilidad con otras aplicaciones.
54 |
55 | En Python esto se hace con [*entornos virtuales*](https://docs.python.org/3/tutorial/venv.html), que son árboles de directorios autocontenidos que contienen cada uno una instalación de Python con una versión particular de Python junto con todos los paquetes que la aplicación necesita. La creación de un entorno virtual de este tipo puede hacerse con varias herramientas diferentes, pero nosotros utilizaremos el paquete oficial de Python para este fin, que se llama [`venv`](https://docs.python.org/3/library/venv.html#module-venv).
56 |
57 | En primer lugar, crea el directorio en el que te gustaría que viviera tu aplicación - por ejemplo, podrías crear un nuevo directorio llamado *transformers-course* en la raíz de tu directorio personal:
58 |
59 | ```
60 | mkdir ~/transformers-course
61 | cd ~/transformers-course
62 | ```
63 |
64 | Desde este directorio, crea un entorno virtual utilizando el módulo `venv` de Python:
65 |
66 | ```
67 | python -m venv .env
68 | ```
69 |
70 | Ahora debería tener un directorio llamado *.env* en su carpeta, por lo demás vacía:
71 |
72 | ```
73 | ls -a
74 | ```
75 |
76 | ```out
77 | . .. .env
78 | ```
79 |
80 | Puedes entrar y salir de tu entorno virtual con el script `activate` y el comando `deactivate`:
81 |
82 | ```
83 | # Activate the virtual environment
84 | source .env/bin/activate
85 |
86 | # Deactivate the virtual environment
87 | deactivate
88 | ```
89 |
90 | Puedes asegurarte de que el entorno está activado ejecutando el comando `which python`: si apunta al entorno virtual, entonces lo has activado con éxito.
91 |
92 | ```
93 | which python
94 | ```
95 |
96 | ```out
97 | /home/<user>/transformers-course/.env/bin/python
98 | ```
99 |
100 | ### Instalación de dependencias
101 |
102 | Al igual que en la sección anterior sobre el uso de las instancias de Google Colab, ahora necesitarás instalar los paquetes necesarios para continuar. De nuevo, puedes instalar la versión de desarrollo de 🤗 Transformers utilizando el gestor de paquetes `pip`:
103 |
104 | ```
105 | pip install "transformers[sentencepiece]"
106 | ```
107 |
108 | Ya está todo preparado y listo para funcionar.
109 |
--------------------------------------------------------------------------------
/chapters/en/chapter8/5.mdx:
--------------------------------------------------------------------------------
1 | # How to write a good issue
2 |
3 |
9 |
10 | When you encounter something that doesn't seem right with one of the Hugging Face libraries, you should definitely let us know so we can fix it (the same goes for any open source library, for that matter). If you are not completely certain whether the bug lies in your own code or one of our libraries, the first place to check is the [forums](https://discuss.huggingface.co/). The community will help you figure this out, and the Hugging Face team also closely watches the discussions there.
11 |
12 |
13 |
14 | When you are sure you have a bug on your hands, the first step is to build a minimal reproducible example.
15 |
16 | ## Creating a minimal reproducible example
17 |
18 | It's very important to isolate the piece of code that produces the bug, as no one in the Hugging Face team is a magician (yet), and they can't fix what they can't see. A minimal reproducible example should, as the name indicates, be reproducible. This means that it should not rely on any external files or data you may have. Try to replace the data you are using with some dummy values that look like your real ones and still produce the same error.
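
For instance, instead of pointing to a private file on your machine, you might inline a couple of made-up examples directly in the script (the values below are purely illustrative):

```py
# Purely illustrative stand-in for your real data: a couple of inline examples
# shaped like the originals, so the script stays self-contained
raw_texts = [
    "This is a short dummy sentence.",
    "Here is a second one shaped like my real data!",
]
labels = [0, 1]
```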
19 |
20 |
21 |
22 | 🚨 Many issues in the 🤗 Transformers repository are unsolved because the data used to reproduce them is not accessible.
23 |
24 |
25 |
26 | Once you have something that is self-contained, you can try to reduce it to even fewer lines of code, building what we call a _minimal reproducible example_. While this requires a bit more work on your side, you are almost guaranteed to get help and a fix if you provide a nice, short bug reproducer.
27 |
28 | If you feel comfortable enough, go inspect the source code where your bug happens. You might find a solution to your problem (in which case you can even suggest a pull request to fix it), but more generally, this can help the maintainers better understand the source of the problem when they read your report.
29 |
30 | ## Filling out the issue template
31 |
32 | When you file your issue, you will notice there is a template to fill out. We will follow the one for [🤗 Transformers issues](https://github.com/huggingface/transformers/issues/new/choose) here, but the same kind of information will be required if you report an issue in another repository. Don't leave the template blank: taking the time to fill it in will maximize your chances of getting an answer and solving your problem.
33 |
34 | In general, when filing an issue, always stay courteous. This is an open source project, so you are using free software, and no one has any obligation to help you. You may include what you feel is justified criticism in your issue, but then the maintainers may very well take it badly and not be in a rush to help you. Make sure you read the [code of conduct](https://github.com/huggingface/transformers/blob/master/CODE_OF_CONDUCT.md) of the project.
35 |
36 | ### Including your environment information
37 |
38 | 🤗 Transformers provides a utility to get all the information we need about your environment. Just type the following in your terminal:
39 |
40 | ```
41 | transformers-cli env
42 | ```
43 |
44 | and you should get something like this:
45 |
46 | ```out
47 | Copy-and-paste the text below in your GitHub issue and FILL OUT the two last points.
48 |
49 | - `transformers` version: 4.12.0.dev0
50 | - Platform: Linux-5.10.61-1-MANJARO-x86_64-with-arch-Manjaro-Linux
51 | - Python version: 3.7.9
52 | - PyTorch version (GPU?): 1.8.1+cu111 (True)
53 | - Tensorflow version (GPU?): 2.5.0 (True)
54 | - Flax version (CPU?/GPU?/TPU?): 0.3.4 (cpu)
55 | - Jax version: 0.2.13
56 | - JaxLib version: 0.1.65
57 | - Using GPU in script?:
58 | - Using distributed or parallel set-up in script?:
59 | ```
60 |
61 | You can also add a `!` at the beginning of the `transformers-cli env` command to execute it from a notebook cell, and then copy and paste the result at the beginning of your issue.
62 |
63 | ### Tagging people
64 |
65 | Tagging people by typing an `@` followed by their GitHub handle will send them a notification so they will see your issue and might reply quicker. Use this in moderation, because the people you tag might not appreciate being notified about something they have no direct link to. If you have looked at the source files related to your bug, you should tag the last person who made changes to the line you think is responsible for your problem (you can find this information by looking at said line on GitHub, selecting it, then clicking "View git blame").
66 |
67 | Otherwise, the template offers suggestions of people to tag. In general, never tag more than three people!
68 |
69 | ### Including a reproducible example
70 |
71 | If you have managed to create a self-contained example that produces the bug, now is the time to include it! Type a line with three backticks followed by `python`, like this:
72 |
73 | ```
74 | ```python
75 | ```
76 |
77 | then paste in your minimal reproducible example and type a new line with three backticks. This will ensure your code is properly formatted.
78 |
79 | If you didn't manage to create a reproducible example, explain in clear steps how you got to your issue. Include a link to a Google Colab notebook where you got the error if you can. The more information you share, the better able the maintainers will be to reply to you.
80 |
81 | In all cases, you should copy and paste the whole error message you are getting. If you're working in Colab, remember that some of the frames may be automatically collapsed in the stack trace, so make sure you expand them before copying. Like with the code sample, put that error message between two lines with three backticks, so it's properly formatted.
82 |
83 | ### Describing the expected behavior
84 |
85 | Explain in a few lines what you expected to get, so that the maintainers get a full grasp of the problem. This part is generally pretty obvious, so it should fit in one sentence, but in some cases you may have a lot to say.
86 |
87 | ## And then what?
88 |
89 | Once your issue is filed, make sure to quickly check everything looks okay. You can edit the issue if you made a mistake, or even change its title if you realize the problem is different from what you initially thought.
90 |
91 | There is no point in pinging people if you don't get an answer right away. If no one helps you within a few days, it's likely that no one could make sense of your problem. Don't hesitate to go back to the reproducible example: can you make it shorter and more to the point? If you don't get an answer within a week, you can leave a message gently asking for help, especially if you've edited your issue to include more information about the problem.
92 |
93 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # The Hugging Face Course
2 |
3 | This repo contains the content that's used to create the **[Hugging Face course](https://huggingface.co/course/chapter1/1)**. The course teaches you about applying Transformers to various tasks in natural language processing and beyond. Along the way, you'll learn how to use the [Hugging Face](https://huggingface.co/) ecosystem — [🤗 Transformers](https://github.com/huggingface/transformers), [🤗 Datasets](https://github.com/huggingface/datasets), [🤗 Tokenizers](https://github.com/huggingface/tokenizers), and [🤗 Accelerate](https://github.com/huggingface/accelerate) — as well as the [Hugging Face Hub](https://huggingface.co/models). It's completely free and open-source!
4 |
5 | ## 🌎 Languages and translations
6 |
7 | | Language | Source | Authors |
8 | |:-------------------------------------------------------|:--------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
9 | | [English](https://huggingface.co/course/en/chapter1/1) | [`chapters/en`](https://github.com/huggingface/course/tree/main/chapters/en) | [@sgugger](https://github.com/sgugger), [@lewtun](https://github.com/lewtun), [@LysandreJik](https://github.com/LysandreJik), [@Rocketknight1](https://github.com/Rocketknight1), [@sashavor](https://github.com/sashavor), [@osanseviero](https://github.com/osanseviero), [@SaulLu](https://github.com/SaulLu), [@lvwerra](https://github.com/lvwerra) |
10 |
11 | ### Translating the course into your language
12 |
13 | As part of our mission to democratise machine learning, we'd love to have the course available in many more languages! Please follow the steps below if you'd like to help translate the course into your language 🙏.
14 |
15 | **🗞️ Open an issue**
16 |
17 | To get started, navigate to the [_Issues_](https://github.com/huggingface/course/issues) page of this repo and check if anyone else has opened an issue for your language. If not, open a new issue by selecting the _Translation template_ from the _New issue_ button.
18 |
19 | Once an issue is created, post a comment to indicate which chapters you'd like to work on and we'll add your name to the list.
20 |
21 | **🍴 Fork the repository**
22 |
23 | Next, you'll need to [fork this repo](https://docs.github.com/en/get-started/quickstart/fork-a-repo). You can do this by clicking on the **Fork** button on the top-right corner of this repo's page.
24 |
25 | Once you've forked the repo, you'll want to get the files on your local machine for editing. You can do that by cloning the fork with Git as follows:
26 |
27 | ```bash
28 | git clone https://github.com/YOUR-USERNAME/course
29 | ```
30 |
31 | **📋 Copy-paste the English files with a new language code**
32 |
33 | The course files are organised under a main directory:
34 |
35 | * [`chapters`](https://github.com/huggingface/course/tree/main/chapters): all the text and code snippets associated with the course.
36 |
37 | You'll only need to copy the files in the [`chapters/en`](https://github.com/huggingface/course/tree/main/chapters/en) directory, so first navigate to your fork of the repo and run the following:
38 |
39 | ```bash
40 | cd ~/path/to/course
41 | cp -r chapters/en/CHAPTER-NUMBER chapters/LANG-ID/CHAPTER-NUMBER
42 | ```
43 |
44 | Here, `CHAPTER-NUMBER` refers to the chapter you'd like to work on and `LANG-ID` should be one of the ISO 639-1 or ISO 639-2 language codes -- see [here](https://www.loc.gov/standards/iso639-2/php/code_list.php) for a handy table.
45 |
46 | **✍️ Start translating**
47 |
48 | Now comes the fun part - translating the text! The first thing we recommend is translating the part of the `_toctree.yml` file that corresponds to your chapter. This file is used to render the table of contents on the website and provide the links to the Colab notebooks. The only fields you should change are the `title` ones -- for example, here are the parts of `_toctree.yml` that we'd translate for [Chapter 0](https://huggingface.co/course/chapter0/1?fw=pt):
49 |
50 | ```yaml
51 | - title: 0. Setup # Translate this!
52 | sections:
53 | - local: chapter0/1 # Do not change this!
54 | title: Introduction # Translate this!
55 | ```
56 |
57 | Once you have translated the `_toctree.yml` file, you can start translating the [MDX](https://mdxjs.com/) files associated with your chapter.
58 |
59 | > 🙋 If the `_toctree.yml` file doesn't yet exist for your language, you can simply create one by copy-pasting from the English version and deleting the sections that aren't related to your chapter. Just make sure it exists in the `chapters/LANG-ID/` directory!
60 |
61 | ## 📔 Jupyter notebooks
62 |
63 | The Jupyter notebooks containing all the code from the course are hosted on the [`huggingface/notebooks`](https://github.com/huggingface/notebooks) repo. If you wish to generate them locally, first install the required dependencies:
64 |
65 | ```bash
66 | python -m pip install -r requirements.txt
67 | ```
68 |
69 | Then run the following script:
70 |
71 | ```bash
72 | python utils/generate_notebooks.py --output_dir nbs
73 | ```
74 |
75 | This script extracts all the code snippets from the English chapters and stores them as notebooks in the `nbs` folder (which is ignored by Git by default).
76 |
77 | ## ✍️ Contributing a new chapter
78 |
79 | > Note: we are not currently accepting community contributions for new chapters. These instructions are for the Hugging Face authors.
80 |
81 | Adding a new chapter to the course is quite simple:
82 |
83 | 1. Create a new directory under `chapters/en/chapterX`, where `chapterX` is the chapter you'd like to add.
84 | 2. Add numbered MDX files `sectionX.mdx` for each section. If you need to include images, place them in the [huggingface-course/documentation-images](https://huggingface.co/datasets/huggingface-course/documentation-images) repository and use the [HTML Images Syntax](https://www.w3schools.com/html/html_images.asp) with the path `https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/{langY}/{chapterX}/{your-image.png}`.
85 | 3. Update the `_toctree.yml` file to include your chapter sections -- this information will render the table of contents on the website. If your section involves both the PyTorch and TensorFlow APIs of `transformers`, make sure you include links to both Colabs in the `colab` field.
86 |
87 | If you get stuck, check out one of the existing chapters -- this will often show you the expected syntax.
88 |
89 | Once you are happy with the content, open a pull request and tag [@lewtun](https://github.com/lewtun) for a review. We recommend adding the first chapter draft as a single pull request -- the team will then provide feedback internally to iterate on the content 🤗!
90 |
91 | ## 🙌 Acknowledgements
92 |
93 | The structure of this repo and README are inspired by the wonderful [Advanced NLP with spaCy](https://github.com/ines/spacy-course) course.
--------------------------------------------------------------------------------
/chapters/en/chapter2/6.mdx:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Putting it all together
4 |
5 | {#if fw === 'pt'}
6 |
7 |
13 |
14 | {:else}
15 |
16 |
22 |
23 | {/if}
24 |
25 | In the last few sections, we've been trying our best to do most of the work by hand. We've explored how tokenizers work and looked at tokenization, conversion to input IDs, padding, truncation, and attention masks.
26 |
27 | However, as we saw in section 2, the 🤗 Transformers API can handle all of this for us with a high-level function that we'll dive into here. When you call your `tokenizer` directly on the sentence, you get back inputs that are ready to pass through your model:
28 |
29 | ```py
30 | from transformers import AutoTokenizer
31 |
32 | checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
33 | tokenizer = AutoTokenizer.from_pretrained(checkpoint)
34 |
35 | sequence = "I've been waiting for a HuggingFace course my whole life."
36 |
37 | model_inputs = tokenizer(sequence)
38 | ```
39 |
40 | Here, the `model_inputs` variable contains everything that's necessary for a model to operate well. For DistilBERT, that includes the input IDs as well as the attention mask. Other models that accept additional inputs will also have those returned by the `tokenizer` object.
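
You can peek at exactly what the tokenizer returned by inspecting the keys of `model_inputs` (for this DistilBERT checkpoint we expect the input IDs and the attention mask):

```py
# Expected for this checkpoint: dict_keys(['input_ids', 'attention_mask'])
print(model_inputs.keys())
```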
41 |
42 | As we'll see in some examples below, this method is very powerful. First, it can tokenize a single sequence:
43 |
44 | ```py
45 | sequence = "I've been waiting for a HuggingFace course my whole life."
46 |
47 | model_inputs = tokenizer(sequence)
48 | ```
49 |
50 | It also handles multiple sequences at a time, with no change in the API:
51 |
52 | ```py
53 | sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
54 |
55 | model_inputs = tokenizer(sequences)
56 | ```
57 |
58 | It can pad according to several strategies:
59 |
60 | ```py
61 | # Will pad the sequences up to the maximum sequence length
62 | model_inputs = tokenizer(sequences, padding="longest")
63 |
64 | # Will pad the sequences up to the model max length
65 | # (512 for BERT or DistilBERT)
66 | model_inputs = tokenizer(sequences, padding="max_length")
67 |
68 | # Will pad the sequences up to the specified max length
69 | model_inputs = tokenizer(sequences, padding="max_length", max_length=8)
70 | ```
71 |
72 | It can also truncate sequences:
73 |
74 | ```py
75 | sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
76 |
77 | # Will truncate the sequences that are longer than the model max length
78 | # (512 for BERT or DistilBERT)
79 | model_inputs = tokenizer(sequences, truncation=True)
80 |
81 | # Will truncate the sequences that are longer than the specified max length
82 | model_inputs = tokenizer(sequences, max_length=8, truncation=True)
83 | ```
84 |
85 | The `tokenizer` object can handle the conversion to specific framework tensors, which can then be directly sent to the model. For example, in the following code sample we are prompting the tokenizer to return tensors from the different frameworks — `"pt"` returns PyTorch tensors, `"tf"` returns TensorFlow tensors, and `"np"` returns NumPy arrays:
86 |
87 | ```py
88 | sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
89 |
90 | # Returns PyTorch tensors
91 | model_inputs = tokenizer(sequences, padding=True, return_tensors="pt")
92 |
93 | # Returns TensorFlow tensors
94 | model_inputs = tokenizer(sequences, padding=True, return_tensors="tf")
95 |
96 | # Returns NumPy arrays
97 | model_inputs = tokenizer(sequences, padding=True, return_tensors="np")
98 | ```
99 |
100 | ## Special tokens
101 |
102 | If we take a look at the input IDs returned by the tokenizer, we will see they are a tiny bit different from what we had earlier:
103 |
104 | ```py
105 | sequence = "I've been waiting for a HuggingFace course my whole life."
106 |
107 | model_inputs = tokenizer(sequence)
108 | print(model_inputs["input_ids"])
109 |
110 | tokens = tokenizer.tokenize(sequence)
111 | ids = tokenizer.convert_tokens_to_ids(tokens)
112 | print(ids)
113 | ```
114 |
115 | ```python out
116 | [101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]
117 | [1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]
118 | ```
119 |
120 | One token ID was added at the beginning, and one at the end. Let's decode the two sequences of IDs above to see what this is about:
121 |
122 | ```py
123 | print(tokenizer.decode(model_inputs["input_ids"]))
124 | print(tokenizer.decode(ids))
125 | ```
126 |
127 | ```python out
128 | "[CLS] i've been waiting for a huggingface course my whole life. [SEP]"
129 | "i've been waiting for a huggingface course my whole life."
130 | ```
131 |
132 | The tokenizer added the special word `[CLS]` at the beginning and the special word `[SEP]` at the end. This is because the model was pretrained with those, so to get the same results for inference we need to add them as well. Note that some models don't add special words, or add different ones; models may also add these special words only at the beginning, or only at the end. In any case, the tokenizer knows which ones are expected and will deal with this for you.
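
If you want to check which special tokens a given checkpoint expects, the tokenizer exposes them as attributes (the outputs in the comments below are what we'd expect for this DistilBERT checkpoint):

```py
print(tokenizer.cls_token, tokenizer.sep_token)
# [CLS] [SEP]
print(tokenizer.all_special_tokens)
# ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
```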
133 |
134 | ## Wrapping up: From tokenizer to model
135 |
136 | Now that we've seen all the individual steps the `tokenizer` object uses when applied on texts, let's see one final time how it can handle multiple sequences (padding!), very long sequences (truncation!), and multiple types of tensors with its main API:
137 |
138 | {#if fw === 'pt'}
139 | ```py
140 | import torch
141 | from transformers import AutoTokenizer, AutoModelForSequenceClassification
142 |
143 | checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
144 | tokenizer = AutoTokenizer.from_pretrained(checkpoint)
145 | model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
146 | sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
147 |
148 | tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
149 | output = model(**tokens)
150 | ```
151 | {:else}
152 | ```py
153 | import tensorflow as tf
154 | from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
155 |
156 | checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
157 | tokenizer = AutoTokenizer.from_pretrained(checkpoint)
158 | model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)
159 | sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
160 |
161 | tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="tf")
162 | output = model(**tokens)
163 | ```
164 | {/if}
165 |
--------------------------------------------------------------------------------
/chapters/en/chapter4/6.mdx:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # End-of-chapter quiz
6 |
7 | Let's test what you learned in this chapter!
8 |
9 | ### 1. What are models on the Hub limited to?
10 |
11 |
32 |
33 | ### 2. How can you manage models on the Hub?
34 |
35 | git-lfs for large files.",
48 | correct: true
49 | }
50 | ]}
51 | />
52 |
53 | ### 3. What can you do using the Hugging Face Hub web interface?
54 |
55 |
83 |
84 | ### 4. What is a model card?
85 |
86 |
103 |
104 | ### 5. Which of these objects of the 🤗 Transformers library can be directly shared on the Hub with `push_to_hub()`?
105 |
106 | {#if fw === 'pt'}
107 | push_to_hub method, and using it will push all the tokenizer files (vocabulary, architecture of the tokenizer, etc.) to a given repo. That's not the only right answer, though!",
112 | correct: true
113 | },
114 | {
115 | text: "A model configuration",
116 | explain: "Right! All model configurations have the push_to_hub method, and using it will push them to a given repo. What else can you share?",
117 | correct: true
118 | },
119 | {
120 | text: "A model",
121 | explain: "Correct! All models have the push_to_hub method, and using it will push them and their configuration files to a given repo. That's not all you can share, though.",
122 | correct: true
123 | },
124 | {
125 | text: "A Trainer",
126 | explain: "That's right — the Trainer also implements the push_to_hub method, and using it will upload the model, its configuration, the tokenizer, and a model card draft to a given repo. Try another answer!",
127 | correct: true
128 | }
129 | ]}
130 | />
131 | {:else}
132 | push_to_hub method, and using it will push all the tokenizer files (vocabulary, architecture of the tokenizer, etc.) to a given repo. That's not the only right answer, though!",
137 | correct: true
138 | },
139 | {
140 | text: "A model configuration",
141 | explain: "Right! All model configurations have the push_to_hub method, and using it will push them to a given repo. What else can you share?",
142 | correct: true
143 | },
144 | {
145 | text: "A model",
146 | explain: "Correct! All models have the push_to_hub method, and using it will push them and their configuration files to a given repo. That's not all you can share, though.",
147 | correct: true
148 | },
149 | {
150 | text: "All of the above with a dedicated callback",
151 | explain: "That's right — the PushToHubCallback will regularly send all of those objects to a repo during training.",
152 | correct: true
153 | }
154 | ]}
155 | />
156 | {/if}
157 |
158 | ### 6. What is the first step when using the `push_to_hub()` method or the CLI tools?
159 |
160 |
178 |
179 | ### 7. You're using a model and a tokenizer — how can you upload them to the Hub?
180 |
181 | huggingface_hub utility.",
190 | explain: "Models and tokenizers already benefit from huggingface_hub utilities: no need for additional wrapping!"
191 | },
192 | {
193 | text: "By saving them to disk and calling transformers-cli upload-model",
194 | explain: "The command upload-model does not exist."
195 | }
196 | ]}
197 | />
198 |
199 | ### 8. Which git operations can you do with the `Repository` class?
200 |
201 | git_commit() method is there for that.",
206 | correct: true
207 | },
208 | {
209 | text: "A pull",
210 | explain: "That is the purpose of the git_pull() method.",
211 | correct: true
212 | },
213 | {
214 | text: "A push",
215 | explain: "The method git_push() does this.",
216 | correct: true
217 | },
218 | {
219 | text: "A merge",
220 | explain: "No, that operation will never be possible with this API."
221 | }
222 | ]}
223 | />
224 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/translations.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Translation template
3 | about: 🤝 Translating the course to another language
4 | title: ''
5 | labels: translation
6 | assignees: ''
7 |
8 | ---
9 |
10 |
11 |
12 | Hi there 👋
13 |
14 | Let's translate the course to `YOUR-LANG` so that the whole community can benefit from this resource 🌎!
15 |
16 | Below are the chapters and files that need translating - let us know here if you'd like to translate any and we'll add your name to the list. Once you're finished, open a pull request and tag this issue by including `#issue-number` in the description, where `issue-number` is the number of this issue.
17 |
18 | > 🙋 If you'd like others to help you with the translation, you can also post in our [forums](https://discuss.huggingface.co/c/course/20) or tag [@_lewtun](https://twitter.com/_lewtun) on Twitter to gain some visibility.
19 |
20 | ## Chapters
21 |
22 | **0 - Setup**
23 | - [ ] [`1.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter0/1.mdx)
24 |
25 | **1 - Transformer models**
26 | - [ ] [`1.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter1/1.mdx)
27 | - [ ] [`2.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter1/2.mdx)
28 | - [ ] [`3.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter1/3.mdx)
29 | - [ ] [`4.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter1/4.mdx)
30 | - [ ] [`5.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter1/5.mdx)
31 | - [ ] [`6.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter1/6.mdx)
32 | - [ ] [`7.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter1/7.mdx)
33 | - [ ] [`8.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter1/8.mdx)
34 | - [ ] [`9.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter1/9.mdx)
35 | - [ ] [`10.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter1/10.mdx)
36 |
37 | **2 - Using 🤗 Transformers**
38 | - [ ] [`1.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter2/1.mdx)
39 | - [ ] [`2.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter2/2.mdx)
40 | - [ ] [`3.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter2/3.mdx)
41 | - [ ] [`4.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter2/4.mdx)
42 | - [ ] [`5.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter2/5.mdx)
43 | - [ ] [`6.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter2/6.mdx)
44 | - [ ] [`7.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter2/7.mdx)
45 | - [ ] [`8.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter2/8.mdx)
46 |
47 | **3 - Fine-tuning a pretrained model**
48 | - [ ] [`1.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter3/1.mdx)
49 | - [ ] [`2.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter3/2.mdx)
50 | - [ ] [`3.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter3/3.mdx)
51 | - [ ] [`3_tf.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter3/3_tf.mdx)
52 | - [ ] [`4.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter3/4.mdx)
53 | - [ ] [`5.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter3/5.mdx)
54 | - [ ] [`6.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter3/6.mdx)
55 |
56 | **4 - Sharing models and tokenizers**
57 | - [ ] [`1.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter4/1.mdx)
58 | - [ ] [`2.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter4/2.mdx)
59 | - [ ] [`3.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter4/3.mdx)
60 | - [ ] [`4.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter4/4.mdx)
61 | - [ ] [`5.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter4/5.mdx)
62 | - [ ] [`6.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter4/6.mdx)
63 |
64 | **5 - The 🤗 Datasets library**
65 | - [ ] [`1.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter5/1.mdx)
66 | - [ ] [`2.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter5/2.mdx)
67 | - [ ] [`3.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter5/3.mdx)
68 | - [ ] [`4.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter5/4.mdx)
69 | - [ ] [`5.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter5/5.mdx)
70 | - [ ] [`6.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter5/6.mdx)
71 | - [ ] [`7.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter5/7.mdx)
72 | - [ ] [`8.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter5/8.mdx)
73 |
74 | **6 - The 🤗 Tokenizers library**
75 | - [ ] [`1.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter6/1.mdx)
76 | - [ ] [`2.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter6/2.mdx)
77 | - [ ] [`3.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter6/3.mdx)
78 | - [ ] [`3b.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter6/3b.mdx)
79 | - [ ] [`4.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter6/4.mdx)
80 | - [ ] [`5.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter6/5.mdx)
81 | - [ ] [`6.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter6/6.mdx)
82 | - [ ] [`7.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter6/7.mdx)
83 | - [ ] [`8.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter6/8.mdx)
84 | - [ ] [`9.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter6/9.mdx)
85 | - [ ] [`10.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter6/10.mdx)
86 |
87 | **7 - Main NLP tasks**
88 | - [ ] [`1.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter7/1.mdx)
89 | - [ ] [`2.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter7/2.mdx)
90 | - [ ] [`3.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter7/3.mdx)
91 | - [ ] [`4.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter7/4.mdx)
92 | - [ ] [`5.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter7/5.mdx)
93 | - [ ] [`6.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter7/6.mdx)
94 | - [ ] [`7.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter7/7.mdx)
95 | - [ ] [`8.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter7/8.mdx)
96 | - [ ] [`9.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter7/9.mdx)
97 |
98 | **8 - How to ask for help**
99 | - [ ] [`1.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter8/1.mdx)
100 | - [ ] [`2.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter8/2.mdx)
101 | - [ ] [`3.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter8/3.mdx)
102 | - [ ] [`4.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter8/4.mdx)
103 | - [ ] [`4_tf.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter8/4_tf.mdx)
104 | - [ ] [`5.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter8/5.mdx)
105 | - [ ] [`6.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter8/6.mdx)
106 | - [ ] [`7.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter8/7.mdx)
107 |
108 | **Events**
109 | - [ ] [`1.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/event/1.mdx)
--------------------------------------------------------------------------------
/chapters/en/chapter6/4.mdx:
--------------------------------------------------------------------------------
1 | # Normalization and pre-tokenization
2 |
3 |
9 |
10 | Before we dive more deeply into the three most common subword tokenization algorithms used with Transformer models (Byte-Pair Encoding [BPE], WordPiece, and Unigram), we'll first take a look at the preprocessing that each tokenizer applies to text. Here's a high-level overview of the steps in the tokenization pipeline:
11 |
12 |
13 |

14 |

15 |
16 |
17 | Before splitting a text into subtokens (according to its model), the tokenizer performs two steps: _normalization_ and _pre-tokenization_.
18 |
19 | ## Normalization
20 |
21 |
22 |
23 | The normalization step involves some general cleanup, such as removing needless whitespace, lowercasing, and/or removing accents. If you're familiar with [Unicode normalization](http://www.unicode.org/reports/tr15/) (such as NFC or NFKC), this is also something the tokenizer may apply.
24 |
25 | The 🤗 Transformers `tokenizer` has an attribute called `backend_tokenizer` that provides access to the underlying tokenizer from the 🤗 Tokenizers library:
26 |
27 | ```py
28 | from transformers import AutoTokenizer
29 |
30 | tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
31 | print(type(tokenizer.backend_tokenizer))
32 | ```
33 |
34 | ```python out
35 | <class 'tokenizers.Tokenizer'>
36 | ```
37 |
38 | The `normalizer` attribute of the `tokenizer` object has a `normalize_str()` method that we can use to see how the normalization is performed:
39 |
40 | ```py
41 | print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
42 | ```
43 |
44 | ```python out
45 | 'hello how are u?'
46 | ```
47 |
48 | In this example, since we picked the `bert-base-uncased` checkpoint, the normalization applied lowercasing and removed the accents.
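
To get a sense of what accent removal involves at the Unicode level, here's a rough standard-library sketch (this is just an illustration, not how the 🤗 Tokenizers normalizer is implemented, and lowercasing would be a separate step):

```py
import unicodedata

text = "Héllò hôw are ü?"
# Decompose accented characters (NFD), then drop the combining marks ("Mn")
stripped = "".join(
    c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn"
)
print(stripped)  # "Hello how are u?"
```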
49 |
50 |
51 |
52 | ✏️ **Try it out!** Load a tokenizer from the `bert-base-cased` checkpoint and pass the same example to it. What are the main differences you can see between the cased and uncased versions of the tokenizer?
53 |
54 |
55 |
56 | ## Pre-tokenization
57 |
58 |
59 |
60 | As we will see in the next sections, a tokenizer cannot be trained on raw text alone. Instead, we first need to split the texts into small entities, like words. That's where the pre-tokenization step comes in. As we saw in [Chapter 2](/course/chapter2), a word-based tokenizer can simply split a raw text into words on whitespace and punctuation. Those words will be the boundaries of the subtokens the tokenizer can learn during its training.
61 |
62 | To see how a fast tokenizer performs pre-tokenization, we can use the `pre_tokenize_str()` method of the `pre_tokenizer` attribute of the `tokenizer` object:
63 |
64 | ```py
65 | tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are  you?")
66 | ```
67 |
68 | ```python out
69 | [('Hello', (0, 5)), (',', (5, 6)), ('how', (7, 10)), ('are', (11, 14)), ('you', (16, 19)), ('?', (19, 20))]
70 | ```
71 |
72 | Notice how the tokenizer is already keeping track of the offsets, which is how it can give us the offset mapping we used in the previous section. Here the tokenizer ignores the two spaces and replaces them with just one, but the offset jumps between `are` and `you` to account for that.
73 |
74 | Since we're using a BERT tokenizer, the pre-tokenization involves splitting on whitespace and punctuation. Other tokenizers can have different rules for this step. For example, if we use the GPT-2 tokenizer:
75 |
76 | ```py
77 | tokenizer = AutoTokenizer.from_pretrained("gpt2")
78 | tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are  you?")
79 | ```
80 |
81 | it will split on whitespace and punctuation as well, but it will keep the spaces and replace them with a `Ġ` symbol, enabling it to recover the original spaces if we decode the tokens:
82 |
83 | ```python out
84 | [('Hello', (0, 5)), (',', (5, 6)), ('Ġhow', (6, 10)), ('Ġare', (10, 14)), ('Ġ', (14, 15)), ('Ġyou', (15, 19)),
85 | ('?', (19, 20))]
86 | ```
87 |
88 | Also note that unlike the BERT tokenizer, this tokenizer does not ignore the double space.
89 |
90 | For a last example, let's have a look at the T5 tokenizer, which is based on the SentencePiece algorithm:
91 |
92 | ```py
93 | tokenizer = AutoTokenizer.from_pretrained("t5-small")
94 | tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are  you?")
95 | ```
96 |
97 | ```python out
98 | [('▁Hello,', (0, 6)), ('▁how', (7, 10)), ('▁are', (11, 14)), ('▁you?', (16, 20))]
99 | ```
100 |
101 | Like the GPT-2 tokenizer, this one keeps spaces and replaces them with a specific token (`▁`), but the T5 tokenizer only splits on whitespace, not punctuation. Also note that it added a space by default at the beginning of the sentence (before `Hello`) and ignored the double space between `are` and `you`.
102 |
103 | Now that we've seen a little of how some different tokenizers process text, we can start to explore the underlying algorithms themselves. We'll begin with a quick look at the broadly applicable SentencePiece; then, over the next three sections, we'll examine how the three main algorithms used for subword tokenization work.
104 |
105 | ## SentencePiece
106 |
107 | [SentencePiece](https://github.com/google/sentencepiece) is a tokenization algorithm for the preprocessing of text that you can use with any of the models we will see in the next three sections. It considers the text as a sequence of Unicode characters, and replaces spaces with a special character, `▁`. Used in conjunction with the Unigram algorithm (see [section 7](/course/chapter6/7)), it doesn't even require a pre-tokenization step, which is very useful for languages where the space character is not used (like Chinese or Japanese).
108 |
109 | The other main feature of SentencePiece is *reversible tokenization*: since there is no special treatment of spaces, decoding the tokens is done simply by concatenating them and replacing the `▁`s with spaces -- this results in the normalized text. As we saw earlier, the BERT tokenizer removes repeating spaces, so its tokenization is not reversible.
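To make this concrete, here is a minimal sketch (assuming `tokenizer` still refers to the `t5-small` tokenizer loaded above) that rebuilds the normalized text from the pre-tokenized pieces by concatenating them and turning the `▁` markers back into spaces:

```py
pieces = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are  you?")

# Concatenate the pieces and swap the ▁ markers back to spaces
detokenized = "".join(piece for piece, _ in pieces).replace("▁", " ").strip()
print(detokenized)
```

```python out
Hello, how are you?
```

Note that the double space is gone: what we recover is the normalized text, not the raw input.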
110 |
111 | ## Algorithm overview
112 |
113 | In the following sections, we'll dive into the three main subword tokenization algorithms: BPE (used by GPT-2 and others), WordPiece (used for example by BERT), and Unigram (used by T5 and others). Before we get started, here's a quick overview of how they each work. Don't hesitate to come back to this table after reading each of the next sections if it doesn't make sense to you yet.
114 |
115 |
116 | Model | BPE | WordPiece | Unigram
117 | :----:|:---:|:---------:|:------:
118 | Training | Starts from a small vocabulary and learns rules to merge tokens | Starts from a small vocabulary and learns rules to merge tokens | Starts from a large vocabulary and removes tokens from it
119 | Training step | Merges the tokens corresponding to the most common pair | Merges the tokens corresponding to the pair with the best score based on the frequency of the pair, privileging pairs where each individual token is less frequent | Removes all the tokens in the vocabulary that will minimize the loss computed on the whole corpus
120 | Learns | Merge rules and a vocabulary | Just a vocabulary | A vocabulary with a score for each token
121 | Encoding | Splits a word into characters and applies the merges learned during training | Finds the longest subword starting from the beginning that is in the vocabulary, then does the same for the rest of the word | Finds the most likely split into tokens, using the scores learned during training
122 |
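To make the "Encoding" row a bit more concrete before we dive in, here is a toy sketch of the longest-match strategy described for WordPiece. The `wordpiece_encode` helper and its five-token vocabulary are made up for illustration, not taken from a real WordPiece model:

```py
vocab = {"hug", "##g", "##s", "h", "##u"}


def wordpiece_encode(word, vocab):
    tokens = []
    while word:
        # Look for the longest prefix of what's left that is in the vocabulary
        for end in range(len(word), 0, -1):
            prefix = word[:end] if not tokens else "##" + word[:end]
            if prefix in vocab:
                tokens.append(prefix)
                word = word[end:]
                break
        else:
            return ["[UNK]"]  # no prefix matched: the whole word is unknown
    return tokens


print(wordpiece_encode("hugs", vocab))
```

```python out
['hug', '##s']
```

BPE and Unigram encode differently (applying learned merges and maximizing a score, respectively), as the next sections will show in detail.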
123 | Now let's dive into BPE!
--------------------------------------------------------------------------------
/chapters/en/chapter5/2.mdx:
--------------------------------------------------------------------------------
1 | # What if my dataset isn't on the Hub?
2 |
3 |
9 |
10 | You know how to use the [Hugging Face Hub](https://huggingface.co/datasets) to download datasets, but you'll often find yourself working with data that is stored either on your laptop or on a remote server. In this section we'll show you how 🤗 Datasets can be used to load datasets that aren't available on the Hugging Face Hub.
11 |
12 |
13 |
14 | ## Working with local and remote datasets
15 |
16 | 🤗 Datasets provides loading scripts to handle the loading of local and remote datasets. It supports several common data formats, such as:
17 |
18 | | Data format | Loading script | Example |
19 | | :----------------: | :------------: | :-----------------------------------------------------: |
20 | | CSV & TSV | `csv` | `load_dataset("csv", data_files="my_file.csv")` |
21 | | Text files | `text` | `load_dataset("text", data_files="my_file.txt")` |
22 | | JSON & JSON Lines | `json` | `load_dataset("json", data_files="my_file.jsonl")` |
23 | | Pickled DataFrames | `pandas` | `load_dataset("pandas", data_files="my_dataframe.pkl")` |
24 |
25 | As shown in the table, for each data format we just need to specify the type of loading script in the `load_dataset()` function, along with a `data_files` argument that specifies the path to one or more files. Let's start by loading a dataset from local files; later we'll see how to do the same with remote files.
26 |
27 | ## Loading a local dataset
28 |
29 | For this example we'll use the [SQuAD-it dataset](https://github.com/crux82/squad-it/), which is a large-scale dataset for question answering in Italian.
30 |
31 | The training and test splits are hosted on GitHub, so we can download them with a simple `wget` command:
32 |
33 | ```python
34 | !wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
35 | !wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz
36 | ```
37 |
38 | This will download two compressed files called *SQuAD_it-train.json.gz* and *SQuAD_it-test.json.gz*, which we can decompress with the Linux `gzip` command:
39 |
40 | ```python
41 | !gzip -dkv SQuAD_it-*.json.gz
42 | ```
43 |
44 | ```bash
45 | SQuAD_it-test.json.gz: 87.4% -- replaced with SQuAD_it-test.json
46 | SQuAD_it-train.json.gz: 82.2% -- replaced with SQuAD_it-train.json
47 | ```
48 |
49 | We can see that the compressed files have been replaced with _SQuAD_it-train.json_ and _SQuAD_it-test.json_, and that the data is stored in the JSON format.
50 |
51 |
52 |
53 | ✎ If you're wondering why there's a `!` character in the above shell commands, that's because we're running them within a Jupyter notebook. Simply remove the prefix if you want to download and unzip the dataset within a terminal.
54 |
55 |
56 |
57 | To load a JSON file with the `load_dataset()` function, we just need to know if we're dealing with ordinary JSON (similar to a nested dictionary) or JSON Lines (line-separated JSON). Like many question answering datasets, SQuAD-it uses the nested format, with all the text stored in a `data` field. This means we can load the dataset by specifying the `field` argument as follows:
58 |
59 | ```py
60 | from datasets import load_dataset
61 |
62 | squad_it_dataset = load_dataset("json", data_files="SQuAD_it-train.json", field="data")
63 | ```
64 |
65 | By default, loading local files creates a `DatasetDict` object with a `train` split. We can see this by inspecting the `squad_it_dataset` object:
66 |
67 | ```py
68 | squad_it_dataset
69 | ```
70 |
71 | ```python out
72 | DatasetDict({
73 | train: Dataset({
74 | features: ['title', 'paragraphs'],
75 | num_rows: 442
76 | })
77 | })
78 | ```
79 |
80 | This shows us the number of rows and the column names associated with the training set. We can view one of the examples by indexing into the `train` split as follows:
81 |
82 | ```py
83 | squad_it_dataset["train"][0]
84 | ```
85 |
86 | ```python out
87 | {
88 | "title": "Terremoto del Sichuan del 2008",
89 | "paragraphs": [
90 | {
91 | "context": "Il terremoto del Sichuan del 2008 o il terremoto...",
92 | "qas": [
93 | {
94 | "answers": [{"answer_start": 29, "text": "2008"}],
95 | "id": "56cdca7862d2951400fa6826",
96 | "question": "In quale anno si è verificato il terremoto nel Sichuan?",
97 | },
98 | ...
99 | ],
100 | },
101 | ...
102 | ],
103 | }
104 | ```
105 |
106 | Great, we've loaded our first local dataset! But while this worked for the training set, what we really want is to include both the `train` and `test` splits in a single `DatasetDict` object so we can apply `Dataset.map()` functions across both splits at once. To do this, we can provide a dictionary to the `data_files` argument that maps each split name to a file associated with that split:
107 |
108 | ```py
109 | data_files = {"train": "SQuAD_it-train.json", "test": "SQuAD_it-test.json"}
110 | squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
111 | squad_it_dataset
112 | ```
113 |
114 | ```python out
115 | DatasetDict({
116 | train: Dataset({
117 | features: ['title', 'paragraphs'],
118 | num_rows: 442
119 | })
120 | test: Dataset({
121 | features: ['title', 'paragraphs'],
122 | num_rows: 48
123 | })
124 | })
125 | ```
126 |
127 | This is exactly what we wanted. Now, we can apply various preprocessing techniques to clean up the data, tokenize the text, and so on.
128 |
129 |
130 |
131 | The `data_files` argument of the `load_dataset()` function is quite flexible and can be either a single file path, a list of file paths, or a dictionary that maps split names to file paths. You can also glob files that match a specified pattern according to the rules used by the Unix shell (e.g., you can glob all the JSON files in a directory as a single split by setting `data_files="*.json"`). See the 🤗 Datasets [documentation](https://huggingface.co/docs/datasets/loading.html#local-and-remote-files) for more details.
132 |
133 |
134 |
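For example, here is a minimal sketch of passing a list of files and a glob pattern to `data_files` (the `part-001.json` and `part-002.json` file names are just placeholders):

```py
from datasets import load_dataset

# Combine several files into a single split by passing a list ...
squad_it_dataset = load_dataset(
    "json", data_files={"train": ["part-001.json", "part-002.json"]}, field="data"
)

# ... or by passing a Unix-style glob pattern
squad_it_dataset = load_dataset("json", data_files="SQuAD_it-*.json", field="data")
```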
135 | The loading scripts in 🤗 Datasets actually support automatic decompression of the input files, so we could have skipped the use of `gzip` by pointing the `data_files` argument directly to the compressed files:
136 |
137 | ```py
138 | data_files = {"train": "SQuAD_it-train.json.gz", "test": "SQuAD_it-test.json.gz"}
139 | squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
140 | ```
141 |
142 | This can be useful if you don't want to manually decompress many GZIP files. The automatic decompression also applies to other common formats like ZIP and TAR, so you just need to point `data_files` to the compressed files and you're good to go!
143 |
144 | Now that you know how to load local files on your laptop or desktop, let's take a look at loading remote files.
145 |
146 | ## Loading a remote dataset
147 |
148 | If you're working as a data scientist or coder in a company, there's a good chance the datasets you want to analyze are stored on some remote server. Fortunately, loading remote files is just as simple as loading local ones! Instead of providing a path to local files, we point the `data_files` argument of `load_dataset()` to one or more URLs where the remote files are stored. For example, for the SQuAD-it dataset hosted on GitHub, we can just point `data_files` to the _SQuAD_it-*.json.gz_ URLs as follows:
149 |
150 | ```py
151 | url = "https://github.com/crux82/squad-it/raw/master/"
152 | data_files = {
153 | "train": url + "SQuAD_it-train.json.gz",
154 | "test": url + "SQuAD_it-test.json.gz",
155 | }
156 | squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
157 | ```
158 |
159 | This returns the same `DatasetDict` object obtained above, but saves us the step of manually downloading and decompressing the _SQuAD_it-*.json.gz_ files. This wraps up our foray into the various ways to load datasets that aren't hosted on the Hugging Face Hub. Now that we've got a dataset to play with, let's get our hands dirty with various data-wrangling techniques!
160 |
161 |
162 |
163 | ✏️ **Try it out!** Pick another dataset hosted on GitHub or the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php) and try loading it both locally and remotely using the techniques introduced above. For bonus points, try loading a dataset that’s stored in a CSV or text format (see the [documentation](https://huggingface.co/docs/datasets/loading.html#local-and-remote-files) for more information on these formats).
164 |
165 |
166 |
167 |
168 |
--------------------------------------------------------------------------------
/chapters/en/chapter8/7.mdx:
--------------------------------------------------------------------------------
1 |
2 |
3 | # End-of-chapter quiz
4 |
5 | Let's test what you learned in this chapter!
6 |
7 | ### 1. In which order should you read a Python traceback?
8 |
9 |
22 |
23 | ### 2. What is a minimal reproducible example?
24 |
25 |
46 |
47 | ### 3. Suppose you try to run the following code, which throws an error:
48 |
49 | ```py
50 | from transformers import GPT3ForSequenceClassification
51 |
52 | # ImportError: cannot import name 'GPT3ForSequenceClassification' from 'transformers' (/Users/lewtun/miniconda3/envs/huggingface/lib/python3.8/site-packages/transformers/__init__.py)
53 | # ---------------------------------------------------------------------------
54 | # ImportError Traceback (most recent call last)
55 | # /var/folders/28/k4cy5q7s2hs92xq7_h89_vgm0000gn/T/ipykernel_30848/333858878.py in
56 | # ----> 1 from transformers import GPT3ForSequenceClassification
57 |
58 | # ImportError: cannot import name 'GPT3ForSequenceClassification' from 'transformers' (/Users/lewtun/miniconda3/envs/huggingface/lib/python3.8/site-packages/transformers/__init__.py)
59 | ```
60 |
61 | Which of the following might be a good choice for the title of a forum topic to ask for help?
62 |
63 | ImportError: cannot import name 'GPT3ForSequenceClassification' from 'transformers' (/Users/lewtun/miniconda3/envs/huggingface/lib/python3.8/site-packages/transformers/__init__.py)",
67 | explain: "Including the last line of the traceback can be descriptive, but this is better reserved for the main body of the topic. Try again!"
68 | },
69 | {
70 | text: "Problem with from transformers import GPT3ForSequenceClassification",
71 | explain: "Try again -- although this provides useful information, it's probably best reserved for the main body of the text.",
72 | },
73 | {
74 | text: "Why can't I import GPT3ForSequenceClassification?",
75 | explain: "Good choice! This title is concise and gives the reader a clue about what might be wrong (i.e., that GPT-3 is not supported in 🤗 Transformers).",
76 | correct: true
77 | },
78 | {
79 | text: "Is GPT-3 supported in 🤗 Transformers?",
80 | explain: "Good one! Using questions as topic titles is a great way to communicate the problem to the community.",
81 | correct: true
82 | }
83 | ]}
84 | />
85 |
86 | ### 4. Suppose you've tried to run `trainer.train()` and are faced with a cryptic error that doesn't tell you exactly where the error is coming from. Which of the following is the first place you should look for errors in your training pipeline?
87 |
88 |
109 |
110 | ### 5. What is the best way to debug a CUDA error?
111 |
112 |
137 |
138 | ### 6. What is the best way to get an issue on GitHub fixed?
139 |
140 |
158 |
159 | ### 7. Why is overfitting to one batch usually a good debugging technique?
160 |
161 |
178 |
179 | ### 8. Why is it a good idea to include details on your compute environment with `transformers-cli env` when creating a new issue in the 🤗 Transformers repo?
180 |
181 |
--------------------------------------------------------------------------------
/chapters/en/chapter1/10.mdx:
--------------------------------------------------------------------------------
1 |
2 |
3 | # End-of-chapter quiz
4 |
5 | This chapter covered a lot of ground! Don't worry if you didn't grasp all the details; the next chapters will help you understand how things work under the hood.
6 |
7 | First, though, let's test what you learned in this chapter!
8 |
9 |
10 | ### 1. Explore the Hub and look for the `roberta-large-mnli` checkpoint. What task does it perform?
11 |
12 |
13 | roberta-large-mnli page."
18 | },
19 | {
20 | text: "Text classification",
21 | explain: "More precisely, it classifies if two sentences are logically linked across three labels (contradiction, neutral, entailment) — a task also called natural language inference.",
22 | correct: true
23 | },
24 | {
25 | text: "Text generation",
26 | explain: "Look again on the roberta-large-mnli page."
27 | }
28 | ]}
29 | />
30 |
31 | ### 2. What will the following code return?
32 |
33 | ```py
34 | from transformers import pipeline
35 |
36 | ner = pipeline("ner", grouped_entities=True)
37 | ner("My name is Sylvain and I work at Hugging Face in Brooklyn.")
38 | ```
39 |
40 | sentiment-analysis pipeline."
45 | },
46 | {
47 | text: "It will return a generated text completing this sentence.",
48 | explain: "This is incorrect — it would be a text-generation pipeline.",
49 | },
50 | {
51 | text: "It will return the words representing persons, organizations or locations.",
52 | explain: "Furthermore, with grouped_entities=True, it will group together the words belonging to the same entity, like \"Hugging Face\".",
53 | correct: true
54 | }
55 | ]}
56 | />
57 |
58 | ### 3. What should replace ... in this code sample?
59 |
60 | ```py
61 | from transformers import pipeline
62 |
63 | filler = pipeline("fill-mask", model="bert-base-cased")
64 | result = filler("...")
65 | ```
66 |
67 | has been waiting for you.",
71 | explain: "This is incorrect. Check out the bert-base-cased model card and try to spot your mistake."
72 | },
73 | {
74 | text: "This [MASK] has been waiting for you.",
75 | explain: "Correct! This model's mask token is [MASK].",
76 | correct: true
77 | },
78 | {
79 | text: "This man has been waiting for you.",
80 | explain: "This is incorrect. This pipeline fills in masked words, so it needs a mask token somewhere."
81 | }
82 | ]}
83 | />
84 |
85 | ### 4. Why will this code fail?
86 |
87 | ```py
88 | from transformers import pipeline
89 |
90 | classifier = pipeline("zero-shot-classification")
91 | result = classifier("This is a course about the Transformers library")
92 | ```
93 |
94 | candidate_labels=[...].",
99 | correct: true
100 | },
101 | {
102 | text: "This pipeline requires several sentences, not just one.",
103 | explain: "This is incorrect, though when properly used, this pipeline can take a list of sentences to process (like all other pipelines)."
104 | },
105 | {
106 | text: "The 🤗 Transformers library is broken, as usual.",
107 | explain: "We won't dignify this answer with a comment!"
108 | },
109 | {
110 | text: "This pipeline requires longer inputs; this one is too short.",
111 | explain: "This is incorrect. Note that a very long text will be truncated when processed by this pipeline."
112 | }
113 | ]}
114 | />
115 |
116 | ### 5. What does "transfer learning" mean?
117 |
118 |
135 |
136 | ### 6. True or false? A language model usually does not need labels for its pretraining.
137 |
138 |
139 | self-supervised, which means the labels are created automatically from the inputs (like predicting the next word or filling in some masked words).",
144 | correct: true
145 | },
146 | {
147 | text: "False",
148 | explain: "This is not the correct answer."
149 | }
150 | ]}
151 | />
152 |
153 | ### 7. Select the sentence that best describes the terms "model," "architecture," and "weights."
154 |
155 |
172 |
173 |
174 | ### 8. Which of these types of models would you use for completing prompts with generated text?
175 |
176 |
193 |
194 | ### 9. Which of those types of models would you use for summarizing texts?
195 |
196 |
213 |
214 | ### 10. Which of these types of models would you use for classifying text inputs according to certain labels?
215 |
216 |
233 |
234 | ### 11. What possible source can the bias observed in a model have?
235 |
236 |
255 |
--------------------------------------------------------------------------------
/chapters/en/chapter5/8.mdx:
--------------------------------------------------------------------------------
1 |
2 |
3 | # End-of-chapter quiz
4 |
5 | This chapter covered a lot of ground! Don't worry if you didn't grasp all the details; the next chapters will help you understand how things work under the hood.
6 |
7 | Before moving on, though, let's test what you learned in this chapter.
8 |
9 | ### 1. The `load_dataset()` function in 🤗 Datasets allows you to load a dataset from which of the following locations?
10 |
11 | data_files argument of load_dataset() to load local datasets.",
16 | correct: true
17 | },
18 | {
19 | text: "The Hugging Face Hub",
20 | explain: "Correct! You can load datasets on the Hub by providing the dataset ID, e.g. load_dataset('emotion').",
21 | correct: true
22 | },
23 | {
24 | text: "A remote server",
25 | explain: "Correct! You can pass URLs to the data_files argument of load_dataset() to load remote files.",
26 | correct: true
27 | },
28 | ]}
29 | />
30 |
31 | ### 2. Suppose you load one of the GLUE tasks as follows:
32 |
33 | ```py
34 | from datasets import load_dataset
35 |
36 | dataset = load_dataset("glue", "mrpc", split="train")
37 | ```
38 |
39 | Which of the following commands will produce a random sample of 50 elements from `dataset`?
40 |
41 | dataset.sample(50)",
45 | explain: "This is incorrect -- there is no Dataset.sample() method."
46 | },
47 | {
48 | text: "dataset.shuffle().select(range(50))",
49 | explain: "Correct! As you saw in this chapter, you first shuffle the dataset and then select the samples from it.",
50 | correct: true
51 | },
52 | {
53 | text: "dataset.select(range(50)).shuffle()",
54 | explain: "This is incorrect -- although the code will run, it will only shuffle the first 50 elements in the dataset."
55 | }
56 | ]}
57 | />
58 |
59 | ### 3. Suppose you have a dataset about household pets called `pets_dataset`, which has a `name` column that denotes the name of each pet. Which of the following approaches would allow you to filter the dataset for all pets whose names start with the letter "L"?
60 |
61 | pets_dataset.filter(lambda x : x['name'].startswith('L'))",
65 | explain: "Correct! Using a Python lambda function for these quick filters is a great idea. Can you think of another solution?",
66 | correct: true
67 | },
68 | {
69 | text: "pets_dataset.filter(lambda x['name'].startswith('L'))",
70 | explain: "This is incorrect -- a lambda function takes the general form lambda *arguments* : *expression*, so you need to provide arguments in this case."
71 | },
72 | {
73 | text: "Create a function like def filter_names(x): return x['name'].startswith('L') and run pets_dataset.filter(filter_names).",
74 | explain: "Correct! Just like with Dataset.map(), you can pass explicit functions to Dataset.filter(). This is useful when you have some complex logic that isn't suitable for a short lambda function. Which of the other solutions would work?",
75 | correct: true
76 | }
77 | ]}
78 | />
79 |
80 | ### 4. What is memory mapping?
81 |
82 |
99 |
100 | ### 5. Which of the following are the main benefits of memory mapping?
101 |
102 |
120 |
121 | ### 6. Why does the following code fail?
122 |
123 | ```py
124 | from datasets import load_dataset
125 |
126 | dataset = load_dataset("allocine", streaming=True, split="train")
127 | dataset[0]
128 | ```
129 |
130 | IterableDataset.",
138 | explain: "Correct! An IterableDataset is a generator, not a container, so you should access its elements using next(iter(dataset)).",
139 | correct: true
140 | },
141 | {
142 | text: "The allocine dataset doesn't have a train split.",
143 | explain: "This is incorrect -- check out the [allocine dataset card](https://huggingface.co/datasets/allocine) on the Hub to see which splits it contains."
144 | }
145 | ]}
146 | />
147 |
148 | ### 7. Which of the following are the main benefits of creating a dataset card?
149 |
150 |
169 |
170 |
171 | ### 8. What is semantic search?
172 |
173 |
191 |
192 | ### 9. For asymmetric semantic search, you usually have:
193 |
194 |
211 |
212 | ### 10. Can I use 🤗 Datasets to load data for use in other domains, like speech processing?
213 |
214 | MNIST dataset on the Hub for a computer vision example."
219 | },
220 | {
221 | text: "Yes",
222 | explain: "Correct! Check out the exciting developments with speech and vision in the 🤗 Transformers library to see how 🤗 Datasets is used in these domains.",
223 | correct : true
224 | },
225 | ]}
226 | />
227 |
--------------------------------------------------------------------------------
/chapters/en/chapter2/3.mdx:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Models
4 |
5 | {#if fw === 'pt'}
6 |
7 |
13 |
14 | {:else}
15 |
16 |
22 |
23 | {/if}
24 |
25 | {#if fw === 'pt'}
26 |
27 | {:else}
28 |
29 | {/if}
30 |
31 | {#if fw === 'pt'}
32 | In this section we'll take a closer look at creating and using a model. We'll use the `AutoModel` class, which is handy when you want to instantiate any model from a checkpoint.
33 |
34 | The `AutoModel` class and all of its relatives are actually simple wrappers over the wide variety of models available in the library. It's a clever wrapper as it can automatically guess the appropriate model architecture for your checkpoint, and then instantiate a model with this architecture.
35 |
36 | {:else}
37 | In this section we'll take a closer look at creating and using a model. We'll use the `TFAutoModel` class, which is handy when you want to instantiate any model from a checkpoint.
38 |
39 | The `TFAutoModel` class and all of its relatives are actually simple wrappers over the wide variety of models available in the library. It's a clever wrapper as it can automatically guess the appropriate model architecture for your checkpoint, and then instantiates a model with this architecture.
40 |
41 | {/if}
42 |
43 | However, if you know the type of model you want to use, you can use the class that defines its architecture directly. Let's take a look at how this works with a BERT model.
44 |
45 | ## Creating a Transformer
46 |
47 | The first thing we'll need to do to initialize a BERT model is load a configuration object:
48 |
49 | {#if fw === 'pt'}
50 | ```py
51 | from transformers import BertConfig, BertModel
52 |
53 | # Building the config
54 | config = BertConfig()
55 |
56 | # Building the model from the config
57 | model = BertModel(config)
58 | ```
59 | {:else}
60 | ```py
61 | from transformers import BertConfig, TFBertModel
62 |
63 | # Building the config
64 | config = BertConfig()
65 |
66 | # Building the model from the config
67 | model = TFBertModel(config)
68 | ```
69 | {/if}
70 |
71 | The configuration contains many attributes that are used to build the model:
72 |
73 | ```py
74 | print(config)
75 | ```
76 |
77 | ```python out
78 | BertConfig {
79 | [...]
80 | "hidden_size": 768,
81 | "intermediate_size": 3072,
82 | "max_position_embeddings": 512,
83 | "num_attention_heads": 12,
84 | "num_hidden_layers": 12,
85 | [...]
86 | }
87 | ```
88 |
89 | While you haven't seen what all of these attributes do yet, you should recognize some of them: the `hidden_size` attribute defines the size of the `hidden_states` vector, and `num_hidden_layers` defines the number of layers the Transformer model has.
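As a quick illustration (a minimal sketch, not something we'll need in the rest of this section), these attributes can be overridden when building the configuration, for example to define a smaller BERT:

```py
from transformers import BertConfig

# Override a couple of attributes to get a smaller architecture
config = BertConfig(hidden_size=384, num_hidden_layers=6)
print(config.hidden_size, config.num_hidden_layers)
```

```python out
384 6
```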
90 |
91 | ### Different loading methods
92 |
93 | Creating a model from the default configuration initializes it with random values:
94 |
95 | {#if fw === 'pt'}
96 | ```py
97 | from transformers import BertConfig, BertModel
98 |
99 | config = BertConfig()
100 | model = BertModel(config)
101 |
102 | # Model is randomly initialized!
103 | ```
104 | {:else}
105 | ```py
106 | from transformers import BertConfig, TFBertModel
107 |
108 | config = BertConfig()
109 | model = TFBertModel(config)
110 |
111 | # Model is randomly initialized!
112 | ```
113 | {/if}
114 |
115 | The model can be used in this state, but it will output gibberish; it needs to be trained first. We could train the model from scratch on the task at hand, but as you saw in [Chapter 1](/course/chapter1), this would require a long time and a lot of data, and it would have a non-negligible environmental impact. To avoid unnecessary and duplicated effort, it's imperative to be able to share and reuse models that have already been trained.
116 |
117 | Loading a Transformer model that is already trained is simple — we can do this using the `from_pretrained()` method:
118 |
119 | {#if fw === 'pt'}
120 | ```py
121 | from transformers import BertModel
122 |
123 | model = BertModel.from_pretrained("bert-base-cased")
124 | ```
125 |
126 | As you saw earlier, we could replace `BertModel` with the equivalent `AutoModel` class. We'll do this from now on as this produces checkpoint-agnostic code; if your code works for one checkpoint, it should work seamlessly with another. This applies even if the architecture is different, as long as the checkpoint was trained for a similar task (for example, a sentiment analysis task).
127 |
128 | {:else}
129 | ```py
130 | from transformers import TFBertModel
131 |
132 | model = TFBertModel.from_pretrained("bert-base-cased")
133 | ```
134 |
135 | As you saw earlier, we could replace `TFBertModel` with the equivalent `TFAutoModel` class. We'll do this from now on as this produces checkpoint-agnostic code; if your code works for one checkpoint, it should work seamlessly with another. This applies even if the architecture is different, as long as the checkpoint was trained for a similar task (for example, a sentiment analysis task).
136 |
137 | {/if}
138 |
139 | In the code sample above we didn't use `BertConfig`, and instead loaded a pretrained model via the `bert-base-cased` identifier. This is a model checkpoint that was trained by the authors of BERT themselves; you can find more details about it in its [model card](https://huggingface.co/bert-base-cased).
140 |
141 | This model is now initialized with all the weights of the checkpoint. It can be used directly for inference on the tasks it was trained on, and it can also be fine-tuned on a new task. By training with pretrained weights rather than from scratch, we can quickly achieve good results.
142 |
143 | The weights have been downloaded and cached (so future calls to the `from_pretrained()` method won't re-download them) in the cache folder, which defaults to *~/.cache/huggingface/transformers*. You can customize your cache folder by setting the `HF_HOME` environment variable.
144 |
145 | The identifier used to load the model can be the identifier of any model on the Model Hub, as long as it is compatible with the BERT architecture. The entire list of available BERT checkpoints can be found [here](https://huggingface.co/models?filter=bert).
146 |
147 | ### Saving methods
148 |
149 | Saving a model is as easy as loading one — we use the `save_pretrained()` method, which is analogous to the `from_pretrained()` method:
150 |
151 | ```py
152 | model.save_pretrained("directory_on_my_computer")
153 | ```
154 |
155 | This saves two files to your disk:
156 |
157 | {#if fw === 'pt'}
158 | ```
159 | ls directory_on_my_computer
160 |
161 | config.json pytorch_model.bin
162 | ```
163 | {:else}
164 | ```
165 | ls directory_on_my_computer
166 |
167 | config.json tf_model.h5
168 | ```
169 | {/if}
170 |
171 | If you take a look at the *config.json* file, you'll recognize the attributes necessary to build the model architecture. This file also contains some metadata, such as where the checkpoint originated and what 🤗 Transformers version you were using when you last saved the checkpoint.
172 |
173 | {#if fw === 'pt'}
174 | The *pytorch_model.bin* file is known as the *state dictionary*; it contains all your model's weights. The two files go hand in hand; the configuration is necessary to know your model's architecture, while the model weights are your model's parameters.
175 |
176 | {:else}
177 | The *tf_model.h5* file is known as the *state dictionary*; it contains all your model's weights. The two files go hand in hand; the configuration is necessary to know your model's architecture, while the model weights are your model's parameters.
178 |
179 | {/if}
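As a quick sanity check (a minimal sketch; on the TensorFlow side you would use `TFAutoModel` instead), you can load the model you just saved by pointing `from_pretrained()` at the local directory:

```py
from transformers import AutoModel

# Reload the saved configuration and weights from the local directory
model = AutoModel.from_pretrained("directory_on_my_computer")
```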
180 |
181 | ## Using a Transformer model for inference
182 |
183 | Now that you know how to load and save a model, let's try using it to make some predictions. Transformer models can only process numbers — numbers that the tokenizer generates. But before we discuss tokenizers, let's explore what inputs the model accepts.
184 |
185 | Tokenizers can take care of casting the inputs to the appropriate framework's tensors, but to help you understand what's going on, we'll take a quick look at what must be done before sending the inputs to the model.
186 |
187 | Let's say we have a couple of sequences:
188 |
189 | ```py
190 | sequences = ["Hello!", "Cool.", "Nice!"]
191 | ```
192 |
193 | The tokenizer converts these to vocabulary indices which are typically called *input IDs*. Each sequence is now a list of numbers! The resulting output is:
194 |
195 | ```py no-format
196 | encoded_sequences = [
197 | [101, 7592, 999, 102],
198 | [101, 4658, 1012, 102],
199 | [101, 3835, 999, 102],
200 | ]
201 | ```
202 |
203 | This is a list of encoded sequences: a list of lists. Tensors only accept rectangular shapes (think matrices). This "array" is already of rectangular shape, so converting it to a tensor is easy:
204 |
205 | {#if fw === 'pt'}
206 | ```py
207 | import torch
208 |
209 | model_inputs = torch.tensor(encoded_sequences)
210 | ```
211 | {:else}
212 | ```py
213 | import tensorflow as tf
214 |
215 | model_inputs = tf.constant(encoded_sequences)
216 | ```
217 | {/if}
218 |
219 | ### Using the tensors as inputs to the model
220 |
221 | Making use of the tensors with the model is extremely simple — we just call the model with the inputs:
222 |
223 | ```py
224 | output = model(model_inputs)
225 | ```
226 |
227 | While the model accepts a lot of different arguments, only the input IDs are necessary. We'll explain what the other arguments do and when they are required later,
228 | but first we need to take a closer look at the tokenizers that build the inputs that a Transformer model can understand.
229 |
--------------------------------------------------------------------------------
/chapters/en/chapter3/3.mdx:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Fine-tuning a model with the Trainer API
4 |
5 |
11 |
12 |
13 |
14 | 🤗 Transformers provides a `Trainer` class to help you fine-tune any of the pretrained models it provides on your dataset. Once you've done all the data preprocessing work in the last section, you have just a few steps left to define the `Trainer`. The hardest part is likely to be preparing the environment to run `Trainer.train()`, as it will run very slowly on a CPU. If you don't have a GPU set up, you can get access to free GPUs or TPUs on [Google Colab](https://colab.research.google.com/).
15 |
16 | The code examples below assume you have already executed the examples in the previous section. Here is a short summary recapping what you need:
17 |
18 | ```py
19 | from datasets import load_dataset
20 | from transformers import AutoTokenizer, DataCollatorWithPadding
21 |
22 | raw_datasets = load_dataset("glue", "mrpc")
23 | checkpoint = "bert-base-uncased"
24 | tokenizer = AutoTokenizer.from_pretrained(checkpoint)
25 |
26 |
27 | def tokenize_function(example):
28 | return tokenizer(example["sentence1"], example["sentence2"], truncation=True)
29 |
30 |
31 | tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
32 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
33 | ```
34 |
35 | ### Training
36 |
37 | The first step before we can define our `Trainer` is to define a `TrainingArguments` class that will contain all the hyperparameters the `Trainer` will use for training and evaluation. The only argument you have to provide is a directory where the trained model will be saved, as well as the checkpoints along the way. For all the rest, you can leave the defaults, which should work pretty well for a basic fine-tuning.
38 |
39 | ```py
40 | from transformers import TrainingArguments
41 |
42 | training_args = TrainingArguments("test-trainer")
43 | ```
44 |
45 |
46 |
47 | 💡 If you want to automatically upload your model to the Hub during training, pass along `push_to_hub=True` in the `TrainingArguments`. We will learn more about this in [Chapter 4](/course/chapter4/3)
48 |
49 |
50 |
51 | The second step is to define our model. As in the [previous chapter](/course/chapter2), we will use the `AutoModelForSequenceClassification` class, with two labels:
52 |
53 | ```py
54 | from transformers import AutoModelForSequenceClassification
55 |
56 | model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
57 | ```
58 |
59 | You will notice that unlike in [Chapter 2](/course/chapter2), you get a warning after instantiating this pretrained model. This is because BERT has not been pretrained on classifying pairs of sentences, so the head of the pretrained model has been discarded and a new head suitable for sequence classification has been added instead. The warnings indicate that some weights were not used (the ones corresponding to the dropped pretraining head) and that some others were randomly initialized (the ones for the new head). It concludes by encouraging you to train the model, which is exactly what we are going to do now.
60 |
61 | Once we have our model, we can define a `Trainer` by passing it all the objects constructed up to now — the `model`, the `training_args`, the training and validation datasets, our `data_collator`, and our `tokenizer`:
62 |
63 | ```py
64 | from transformers import Trainer
65 |
66 | trainer = Trainer(
67 | model,
68 | training_args,
69 | train_dataset=tokenized_datasets["train"],
70 | eval_dataset=tokenized_datasets["validation"],
71 | data_collator=data_collator,
72 | tokenizer=tokenizer,
73 | )
74 | ```
75 |
76 | Note that when you pass the `tokenizer` as we did here, the default `data_collator` used by the `Trainer` will be a `DataCollatorWithPadding` as defined previously, so you can skip the line `data_collator=data_collator` in this call. It was still important to show you this part of the processing in section 2!
77 |
78 | To fine-tune the model on our dataset, we just have to call the `train()` method of our `Trainer`:
79 |
80 | ```py
81 | trainer.train()
82 | ```
83 |
84 | This will start the fine-tuning (which should take a couple of minutes on a GPU) and report the training loss every 500 steps. It won't, however, tell you how well (or badly) your model is performing. This is because:
85 |
86 | 1. We didn't tell the `Trainer` to evaluate during training by setting `evaluation_strategy` to either `"steps"` (evaluate every `eval_steps`) or `"epoch"` (evaluate at the end of each epoch).
87 | 2. We didn't provide the `Trainer` with a `compute_metrics()` function to calculate a metric during said evaluation (otherwise the evaluation would just have printed the loss, which is not a very intuitive number).
88 |
89 |
90 | ### Evaluation
91 |
92 | Let's see how we can build a useful `compute_metrics()` function and use it the next time we train. The function must take an `EvalPrediction` object (which is a named tuple with a `predictions` field and a `label_ids` field) and will return a dictionary mapping strings to floats (the strings being the names of the metrics returned, and the floats their values). To get some predictions from our model, we can use the `Trainer.predict()` command:
93 |
94 | ```py
95 | predictions = trainer.predict(tokenized_datasets["validation"])
96 | print(predictions.predictions.shape, predictions.label_ids.shape)
97 | ```
98 |
99 | ```python out
100 | (408, 2) (408,)
101 | ```
102 |
103 | The output of the `predict()` method is another named tuple with three fields: `predictions`, `label_ids`, and `metrics`. The `metrics` field will just contain the loss on the dataset passed, as well as some time metrics (how long it took to predict, in total and on average). Once we complete our `compute_metrics()` function and pass it to the `Trainer`, that field will also contain the metrics returned by `compute_metrics()`.
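For instance (a minimal sketch), you can inspect that field directly; the exact keys depend on your 🤗 Transformers version, but you will typically see the loss and some timing information:

```py
# The metrics field holds the loss on the dataset passed plus some timing information
print(predictions.metrics)
```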
104 |
105 | As you can see, `predictions` is a two-dimensional array with shape 408 x 2 (408 being the number of elements in the dataset we used). Those are the logits for each element of the dataset we passed to `predict()` (as you saw in the [previous chapter](/course/chapter2), all Transformer models return logits). To transform them into predictions that we can compare to our labels, we need to take the index with the maximum value on the second axis:
106 |
107 | ```py
108 | import numpy as np
109 |
110 | preds = np.argmax(predictions.predictions, axis=-1)
111 | ```
112 |
113 | We can now compare those `preds` to the labels. To build our `compute_metrics()` function, we will rely on the metrics from the 🤗 Datasets library. We can load the metrics associated with the MRPC dataset as easily as we loaded the dataset, this time with the `load_metric()` function. The object returned has a `compute()` method we can use to do the metric calculation:
114 |
115 | ```py
116 | from datasets import load_metric
117 |
118 | metric = load_metric("glue", "mrpc")
119 | metric.compute(predictions=preds, references=predictions.label_ids)
120 | ```
121 |
122 | ```python out
123 | {'accuracy': 0.8578431372549019, 'f1': 0.8996539792387542}
124 | ```
125 |
126 | The exact results you get may vary, as the random initialization of the model head might change the metrics it achieved. Here, we can see our model has an accuracy of 85.78% on the validation set and an F1 score of 89.97. Those are the two metrics used to evaluate results on the MRPC dataset for the GLUE benchmark. The table in the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf) reported an F1 score of 88.9 for the base model. That was the `uncased` model while we are currently using the `cased` model, which explains the better result.
127 |
128 | Wrapping everything together, we get our `compute_metrics()` function:
129 |
130 | ```py
131 | def compute_metrics(eval_preds):
132 | metric = load_metric("glue", "mrpc")
133 | logits, labels = eval_preds
134 | predictions = np.argmax(logits, axis=-1)
135 | return metric.compute(predictions=predictions, references=labels)
136 | ```
137 |
138 | And to see it used in action to report metrics at the end of each epoch, here is how we define a new `Trainer` with this `compute_metrics()` function:
139 |
140 | ```py
141 | training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
142 | model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
143 |
144 | trainer = Trainer(
145 | model,
146 | training_args,
147 | train_dataset=tokenized_datasets["train"],
148 | eval_dataset=tokenized_datasets["validation"],
149 | data_collator=data_collator,
150 | tokenizer=tokenizer,
151 | compute_metrics=compute_metrics,
152 | )
153 | ```
154 |
155 | Note that we create a new `TrainingArguments` with its `evaluation_strategy` set to `"epoch"` and a new model — otherwise, we would just be continuing the training of the model we have already trained. To launch a new training run, we execute:
156 |
157 | ```
158 | trainer.train()
159 | ```
160 |
161 | This time, it will report the validation loss and metrics at the end of each epoch on top of the training loss. Again, the exact accuracy/F1 score you reach might be a bit different from what we found, because of the random head initialization of the model, but it should be in the same ballpark.
162 |
163 | The `Trainer` will work out of the box on multiple GPUs or TPUs and provides lots of options, like mixed-precision training (use `fp16 = True` in your training arguments). We will go over everything it supports in Chapter 10.
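For example, here is a minimal sketch of turning on mixed precision; the flag is part of `TrainingArguments`, and everything else keeps its defaults:

```py
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch", fp16=True)
```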
164 |
165 | This concludes the introduction to fine-tuning using the `Trainer` API. An example of doing this for most common NLP tasks will be given in Chapter 7, but for now let's look at how to do the same thing in pure PyTorch.
166 |
167 |
168 |
169 | ✏️ **Try it out!** Fine-tune a model on the GLUE SST-2 dataset, using the data processing you did in section 2.
170 |
171 |
172 |
173 |
--------------------------------------------------------------------------------
/chapters/en/event/1.mdx:
--------------------------------------------------------------------------------
1 | # Part 2 Release Event
2 |
3 | For the release of part 2 of the course, we organized a live event with two days of talks before a fine-tuning sprint. If you missed it, you can catch up with the talks which are all listed below!
4 |
5 | ## Day 1: A high-level view of Transformers and how to train them
6 |
7 | **Thomas Wolf:** *Transfer Learning and the birth of the Transformers library*
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 | Thomas Wolf is co-founder and Chief Science Officer of Hugging Face. The tools created by Thomas Wolf and the Hugging Face team are used across more than 5,000 research organisations including Facebook Artificial Intelligence Research, Google Research, DeepMind, Amazon Research, Apple, the Allen Institute for Artificial Intelligence as well as most university departments. Thomas Wolf is the initiator and senior chair of the largest research collaboration that has ever existed in Artificial Intelligence: [“BigScience”](https://bigscience.huggingface.co), as well as a set of widely used [libraries and tools](https://github.com/huggingface/). Thomas Wolf is also a prolific educator, a thought leader in the field of Artificial Intelligence and Natural Language Processing, and a regular invited speaker to conferences all around the world [https://thomwolf.io](https://thomwolf.io).
18 |
19 | **Jay Alammar:** *A gentle visual intro to Transformers models*
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 | Through his popular ML blog, Jay has helped millions of researchers and engineers visually understand machine learning tools and concepts from the basic (ending up in NumPy, Pandas docs) to the cutting-edge (Transformers, BERT, GPT-3).
30 |
31 | **Margaret Mitchell:** *On Values in ML Development*
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 | Margaret Mitchell is a researcher working on Ethical AI, currently focused on the ins and outs of ethics-informed AI development in tech. She has published over 50 papers on natural language generation, assistive technology, computer vision, and AI ethics, and holds multiple patents in the areas of conversation generation and sentiment classification. She previously worked at Google AI as a Staff Research Scientist, where she founded and co-led Google's Ethical AI group, focused on foundational AI ethics research and operationalizing AI ethics Google-internally. Before joining Google, she was a researcher at Microsoft Research, focused on computer vision-to-language generation; and was a postdoc at Johns Hopkins, focused on Bayesian modeling and information extraction. She holds a PhD in Computer Science from the University of Aberdeen and a Master's in computational linguistics from the University of Washington. While earning her degrees, she also worked from 2005-2012 on machine learning, neurological disorders, and assistive technology at Oregon Health and Science University. She has spearheaded a number of workshops and initiatives at the intersections of diversity, inclusion, computer science, and ethics. Her work has received awards from Secretary of Defense Ash Carter and the American Foundation for the Blind, and has been implemented by multiple technology companies. She likes gardening, dogs, and cats.
42 |
43 | **Matthew Watson and Chen Qian:** *NLP workflows with Keras*
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 | Matthew Watson is a machine learning engineer on the Keras team, with a focus on high-level modeling APIs. He studied Computer Graphics as an undergraduate and completed a Master's degree at Stanford University. An almost-English-major who turned towards computer science, he is passionate about working across disciplines and making NLP accessible to a wider audience.
54 |
55 | Chen Qian is a software engineer on the Keras team, with a focus on high-level modeling APIs. Chen holds a Master's degree in Electrical Engineering from Stanford University, and he is especially interested in simplifying the code implementations of ML tasks and large-scale ML.
56 |
57 | **Mark Saroufim:** *How to Train a Model with Pytorch*
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 | Mark Saroufim is a Partner Engineer at Pytorch working on OSS production tools including TorchServe and Pytorch Enterprise. In his past lives, Mark was an Applied Scientist and Product Manager at Graphcore, [yuri.ai](http://yuri.ai/), Microsoft and NASA's JPL. His primary passion is to make programming more fun.
68 |
69 | **Jakob Uszkoreit:** *It Ain't Broke So Don't Fix Let's Break It*
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 | Jakob Uszkoreit is the co-founder of Inceptive. Inceptive designs RNA molecules for vaccines and therapeutics using large-scale deep learning in a tight loop with high throughput experiments with the goal of making RNA-based medicines more accessible, more effective and more broadly applicable. Previously, Jakob worked at Google for more than a decade, leading research and development teams in Google Brain, Research and Search working on deep learning fundamentals, computer vision, language understanding and machine translation.
80 |
81 | ## Day 2: The tools to use
82 |
83 | **Lewis Tunstall:** *Simple Training with the 🤗 Transformers Trainer*
84 |
85 |
86 |
87 |
88 |
89 | Lewis is a machine learning engineer at Hugging Face, focused on developing open-source tools and making them accessible to the wider community. He is also a co-author of an upcoming O’Reilly book on Transformers and you can follow him on Twitter (@_lewtun) for NLP tips and tricks!
90 |
91 | **Matthew Carrigan:** *New TensorFlow Features for 🤗 Transformers and 🤗 Datasets*
92 |
93 |
94 |
95 |
96 |
97 | Matt is responsible for TensorFlow maintenance at Transformers, and will eventually lead a coup against the incumbent PyTorch faction which will likely be co-ordinated via his Twitter account @carrigmat.
98 |
99 | **Lysandre Debut:** *The Hugging Face Hub as a means to collaborate on and share Machine Learning projects*
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 | Lysandre is a Machine Learning Engineer at Hugging Face where he is involved in many open source projects. His aim is to make Machine Learning accessible to everyone by developing powerful tools with a very simple API.
110 |
111 | **Lucile Saulnier:** *Get your own tokenizer with 🤗 Transformers & 🤗 Tokenizers*
112 |
113 |
114 |
115 |
116 |
117 | Lucile is a machine learning engineer at Hugging Face, developing and supporting the use of open source tools. She is also actively involved in many research projects in the field of Natural Language Processing such as collaborative training and BigScience.
118 |
119 | **Sylvain Gugger:** *Supercharge your PyTorch training loop with 🤗 Accelerate*
120 |
121 |
122 |
123 |
124 |
125 | Sylvain is a Research Engineer at Hugging Face and one of the core maintainers of 🤗 Transformers and the developer behind 🤗 Accelerate. He likes making model training more accessible.
126 |
127 | **Merve Noyan:** *Showcase your model demos with 🤗 Spaces*
128 |
129 |
130 |
131 |
132 |
133 | Merve is a developer advocate at Hugging Face, working on developing tools and building content around them to democratize machine learning for everyone.
134 |
135 | **Abubakar Abid:** *Building Machine Learning Applications Fast*
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 | Abubakar Abid is the CEO of [Gradio](https://www.gradio.app). He received his Bachelor of Science in Electrical Engineering and Computer Science from MIT in 2015, and his PhD in Applied Machine Learning from Stanford in 2021. In his role as the CEO of Gradio, Abubakar works on making machine learning models easier to demo, debug, and deploy.
146 |
147 | **Mathieu Desvé:** *AWS ML Vision: Making Machine Learning Accessible to all Customers*
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 | Technology enthusiast and maker in my free time. I like challenges, solving the problems of clients and users, and working with talented people to learn every day. Since 2004 I have worked in multiple positions, switching between frontend, backend, infrastructure, operations, and management, and I try to solve common technical and managerial issues in an agile manner.
158 |
159 | **Philipp Schmid:** *Managed Training with Amazon SageMaker and 🤗 Transformers*
160 |
161 |
162 |
163 |
164 |
165 | Philipp Schmid is a Machine Learning Engineer and Tech Lead at Hugging Face, where he leads the collaboration with the Amazon SageMaker team. He is passionate about democratizing and productionizing cutting-edge NLP models and improving the ease of use for Deep Learning.
--------------------------------------------------------------------------------
/utils/generate_notebooks.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import re
4 | import nbformat
5 | import shutil
6 | import yaml
7 |
8 | from pathlib import Path
9 |
10 | PATH_TO_COURSE = "chapters/en/"
11 |
12 | re_framework_test = re.compile(r"^{#if\s+fw\s+===\s+'([^']+)'}\s*$")
13 | re_framework_else = re.compile(r"^{:else}\s*$")
14 | re_framework_end = re.compile(r"^{/if}\s*$")
15 |
16 | re_html_line = re.compile(r"^<[^>]*/>\s*$")
17 | re_html_tag = re.compile(r"<([^/>]*)>\s*$")
18 |
19 | re_python_code = re.compile(r"^```(?:py|python|py no\-format|python no\-format)\s*$")
20 | re_output_code = re.compile(r"^```(?:py|python)\s+out\s*$")
21 | re_end_code = re.compile(r"^```\s*$")
22 |
23 | frameworks = {"pt": "PyTorch", "tf": "TensorFlow"}
24 |
25 | def read_and_split_frameworks(fname):
26 | """
27 | Read the MDX in fname and creates two versions (if necessary) for each framework.
28 | """
29 | with open(fname, "r") as f:
30 | content = f.readlines()
31 |
32 | contents = {"pt": [], "tf": []}
33 |
34 | differences = False
35 | current_content = []
36 | line_idx = 0
37 | for line in content:
38 | if re_framework_test.search(line) is not None:
39 | differences = True
40 | framework = re_framework_test.search(line).groups()[0]
41 | for key in contents:
42 | contents[key].extend(current_content)
43 | current_content = []
44 | elif re_framework_else.search(line) is not None:
45 | contents[framework].extend(current_content)
46 | current_content = []
47 | framework = "pt" if framework == "tf" else "tf"
48 | elif re_framework_end.search(line) is not None:
49 | contents[framework].extend(current_content)
50 | current_content = []
51 | else:
52 | current_content.append(line)
53 |
54 | if len(current_content) > 0:
55 | for key in contents:
56 | contents[key].extend(current_content)
57 |
58 | if differences:
59 | return {k: "".join(content) for k, content in contents.items()}
60 | else:
61 | return "".join(content)
62 |
63 |
64 | def extract_cells(content):
65 | """
66 | Extract the code/output cells from content.
67 | """
68 | cells = []
69 | current_cell = None
70 | is_output = False
71 | for line in content.split("\n"):
72 | if re_python_code.search(line) is not None:
73 | is_output = False
74 | current_cell = []
75 | elif re_output_code.search(line) is not None:
76 | is_output = True
77 | current_cell = []
78 | elif re_end_code.search(line) is not None and current_cell is not None:
79 | cell = "\n".join(current_cell)
80 | if is_output:
81 | if not isinstance(cells[-1], tuple):
82 | cells[-1] = (cells[-1], cell)
83 | else:
84 | cells.append(cell)
85 | current_cell = None
86 | current_md = []
87 | elif current_cell is not None:
88 | current_cell.append(line)
89 |
90 | return cells
91 |
92 |
93 | def convert_to_nb_cell(cell):
94 | """
95 | Convert some cell (either just code or tuple (code, output)) to a proper notebook cell.
96 | """
97 | nb_cell = {"cell_type": "code", "execution_count": None, "metadata": {}}
98 | if isinstance(cell, tuple):
99 | nb_cell["source"] = cell[0]
100 | nb_cell["outputs"] = [nbformat.notebooknode.NotebookNode({
101 | 'data': {'text/plain': cell[1]},
102 | 'execution_count': None,
103 | 'metadata': {},
104 | 'output_type': 'execute_result',
105 | })]
106 | else:
107 | nb_cell["source"] = cell
108 | nb_cell["outputs"] = []
109 | return nbformat.notebooknode.NotebookNode(nb_cell)
110 |
111 |
112 | def nb_cell(source, code=True):
113 | if not code:
114 | return nbformat.notebooknode.NotebookNode(
115 | {"cell_type": "markdown", "source": source, "metadata": {}}
116 | )
117 | return nbformat.notebooknode.NotebookNode(
118 | {"cell_type": "code", "metadata": {}, "source": source, "execution_count": None, "outputs": []}
119 | )
120 |
121 |
122 | def build_notebook(fname, title, output_dir="."):
123 | """
124 | Build the notebook for fname with a given title in output_dir.
125 | """
126 | sections = read_and_split_frameworks(fname)
127 | sections_with_accelerate = [
128 | "A full training",
129 | "Token classification (PyTorch)",
130 | "Fine-tuning a masked language model (PyTorch)",
131 | "Translation (PyTorch)",
132 | "Summarization (PyTorch)",
133 | "Training a causal language model from scratch (PyTorch)",
134 | "Question answering (PyTorch)",
135 | ]
136 | sections_with_hf_hub = [
137 | "Sharing pretrained models (PyTorch)",
138 | "Sharing pretrained models (TensorFlow)",
139 | "Creating your own dataset",
140 | "Token classification (PyTorch)",
141 | "Token classification (TensorFlow)",
142 | "Training a new tokenizer from an old one",
143 | "Fine-tuning a masked language model (PyTorch)",
144 | "Fine-tuning a masked language model (TensorFlow)",
145 | "Translation (PyTorch)",
146 | "Translation (TensorFlow)",
147 | "Summarization (PyTorch)",
148 | "Summarization (TensorFlow)",
149 | "Training a causal language model from scratch (PyTorch)",
150 | "Training a causal language model from scratch (TensorFlow)",
151 | "Question answering (PyTorch)",
152 | "Question answering (TensorFlow)",
153 | "What to do when you get an error",
154 | ]
155 | sections_with_faiss = ["Semantic search with FAISS (PyTorch)", "Semantic search with FAISS (TensorFlow)"]
156 | stem = Path(fname).stem
157 | if not isinstance(sections, dict):
158 | contents = [sections]
159 | titles = [title]
160 | fnames = [f"{stem}.ipynb"]
161 | else:
162 | contents = []
163 | titles = []
164 | fnames = []
165 | for key, section in sections.items():
166 | contents.append(section)
167 | titles.append(f"{title} ({frameworks[key]})")
168 | fnames.append(f"{stem}_{key}.ipynb")
169 |
170 | for title, content, fname in zip(titles, contents, fnames):
171 | cells = extract_cells(content)
172 | if len(cells) == 0:
173 | continue
174 |
175 | nb_cells = [
176 | nb_cell(f"# {title}", code=False),
177 | nb_cell("Install the Transformers and Datasets libraries to run this notebook.", code=False)
178 | ]
179 |
180 | # Install cell
181 | installs = ["!pip install datasets transformers[sentencepiece]"]
182 | if title in sections_with_accelerate:
183 | installs.append("!pip install accelerate")
184 | installs.append("# To run the training on TPU, you will need to uncomment the followin line:")
185 | installs.append("# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl")
186 | if title in sections_with_hf_hub:
187 | installs.append("!apt install git-lfs")
188 | if title in sections_with_faiss:
189 | installs.append("!pip install faiss-gpu")
190 |
191 | nb_cells.append(nb_cell("\n".join(installs)))
192 |
193 | if title in sections_with_hf_hub:
194 | nb_cells.extend([
195 | nb_cell("You will need to setup git, adapt your email and name in the following cell.", code=False),
196 | nb_cell("!git config --global user.email \"you@example.com\"\n!git config --global user.name \"Your Name\""),
197 | nb_cell("You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.", code=False),
198 | nb_cell("from huggingface_hub import notebook_login\n\nnotebook_login()"),
199 | ])
200 | nb_cells += [convert_to_nb_cell(cell) for cell in cells]
201 | metadata = {"colab": {"name": title, "provenance": []}}
202 | nb_dict = {"cells": nb_cells, "metadata": metadata, "nbformat": 4, "nbformat_minor": 4}
203 | notebook = nbformat.notebooknode.NotebookNode(nb_dict)
204 | os.makedirs(output_dir, exist_ok=True)
205 | nbformat.write(notebook, os.path.join(output_dir, fname), version=4)
206 |
207 |
208 | def get_titles():
209 | """
210 | Parse the _chapters.yml file to build the mapping from section filenames to titles.
211 | """
212 | table = yaml.safe_load(open(os.path.join(PATH_TO_COURSE, "_chapters.yml"), "r"))
213 | result = {}
214 | for entry in table:
215 | chapter_name = entry["local"]
216 | sections = []
217 | for i, section in enumerate(entry["sections"]):
218 | if isinstance(section, str):
219 | result[os.path.join(chapter_name, f"section{i+1}")] = section
220 | else:
221 | section_name = section["local"]
222 | section_title = section["title"]
223 | if isinstance(section_name, str):
224 | result[os.path.join(chapter_name, section_name)] = section_title
225 | else:
226 | if isinstance(section_title, str):
227 | section_title = {key: section_title for key in section_name.keys()}
228 | for key in section_name.keys():
229 | result[os.path.join(chapter_name, section_name[key])] = section_title[key]
230 | return {k: v for k, v in result.items() if "quiz" not in v}
231 |
232 |
233 | def create_notebooks(output_dir):
234 | for folder in os.listdir(output_dir):
235 | if folder.startswith("chapter"):
236 | shutil.rmtree(os.path.join(output_dir, folder))
237 | titles = get_titles()
238 | for fname, title in titles.items():
239 | build_notebook(
240 | os.path.join(PATH_TO_COURSE, f"{fname}.mdx"),
241 | title,
242 | os.path.join(output_dir, Path(fname).parent),
243 | )
244 |
245 |
246 | if __name__ == "__main__":
247 | parser = argparse.ArgumentParser()
248 | parser.add_argument("--output_dir", type=str, help="Where to output the notebooks")
249 | args = parser.parse_args()
250 |
251 | create_notebooks(args.output_dir)
252 |
--------------------------------------------------------------------------------
/chapters/en/chapter3/3_tf.mdx:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Fine-tuning a model with Keras
4 |
5 |
11 |
12 | Once you've done all the data preprocessing work in the last section, you have just a few steps left to train the model. Note, however, that the `model.fit()` command will run very slowly on a CPU. If you don't have a GPU set up, you can get access to free GPUs or TPUs on [Google Colab](https://colab.research.google.com/).
13 |
14 | The code examples below assume you have already executed the examples in the previous section. Here is a short recap of what you need:
15 |
16 | ```py
17 | from datasets import load_dataset
18 | from transformers import AutoTokenizer, DataCollatorWithPadding
19 | import numpy as np
20 |
21 | raw_datasets = load_dataset("glue", "mrpc")
22 | checkpoint = "bert-base-uncased"
23 | tokenizer = AutoTokenizer.from_pretrained(checkpoint)
24 |
25 |
26 | def tokenize_function(example):
27 | return tokenizer(example["sentence1"], example["sentence2"], truncation=True)
28 |
29 |
30 | tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
31 |
32 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
33 |
34 | tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
35 | columns=["attention_mask", "input_ids", "token_type_ids"],
36 | label_cols=["labels"],
37 | shuffle=True,
38 | collate_fn=data_collator,
39 | batch_size=8,
40 | )
41 |
42 | tf_validation_dataset = tokenized_datasets["validation"].to_tf_dataset(
43 | columns=["attention_mask", "input_ids", "token_type_ids"],
44 | label_cols=["labels"],
45 | shuffle=False,
46 | collate_fn=data_collator,
47 | batch_size=8,
48 | )
49 | ```
50 |
51 | ### Training
52 |
53 | TensorFlow models imported from 🤗 Transformers are already Keras models. Here is a short introduction to Keras.
54 |
55 |
56 |
57 | That means that once we have our data, very little work is required to begin training on it.
58 |
59 |
60 |
61 | As in the [previous chapter](/course/chapter2), we will use the `TFAutoModelForSequenceClassification` class, with two labels:
62 |
63 | ```py
64 | from transformers import TFAutoModelForSequenceClassification
65 |
66 | model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
67 | ```
68 |
69 | You will notice that unlike in [Chapter 2](/course/chapter2), you get a warning after instantiating this pretrained model. This is because BERT has not been pretrained on classifying pairs of sentences, so the head of the pretrained model has been discarded and a new head suitable for sequence classification has been inserted instead. The warning indicates that some weights were not used (the ones corresponding to the dropped pretraining head) and that some others were randomly initialized (the ones for the new head). It concludes by encouraging you to train the model, which is exactly what we are going to do now.
70 |
71 | To fine-tune the model on our dataset, we just have to `compile()` our model and then pass our data to the `fit()` method. This will start the fine-tuning process (which should take a couple of minutes on a GPU) and report training loss as it goes, plus the validation loss at the end of each epoch.
72 |
73 |
74 |
75 | Note that 🤗 Transformers models have a special ability that most Keras models don't: they can automatically use an appropriate loss, which they compute internally. They will use this loss by default if you don't set a loss argument in `compile()`. To use the internal loss, you'll need to pass your labels as part of the model input rather than as a separate label (which is the usual way to use labels with Keras models); a rough sketch of this appears right after this tip. You'll see examples of this in Part 2 of the course, where defining the correct loss function can be tricky. For sequence classification, however, a standard Keras loss function works fine, so that's what we'll use here.
76 |
77 |
78 |
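For illustration only, here is a rough sketch of what relying on the internal loss could look like. The key assumption is that the labels are included in the `columns` passed to `to_tf_dataset()`, so they end up inside the input dict; we won't use this approach in this section.

```py
# Sketch only: put the labels inside the inputs so the model can compute its own loss
tf_train_dataset_with_labels = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids", "labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

model.compile(optimizer="adam")  # no loss argument, so the internal loss is used
model.fit(tf_train_dataset_with_labels)
```

With that aside, here is the version with a standard Keras loss that we will actually use: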
79 | ```py
80 | from tensorflow.keras.losses import SparseCategoricalCrossentropy
81 |
82 | model.compile(
83 | optimizer="adam",
84 | loss=SparseCategoricalCrossentropy(from_logits=True),
85 | metrics=["accuracy"],
86 | )
87 | model.fit(
88 | tf_train_dataset,
89 | validation_data=tf_validation_dataset,
90 | )
91 | ```
92 |
93 |
94 |
95 | Note a very common pitfall here — you *can* just pass the name of the loss as a string to Keras, but by default Keras will assume that you have already applied a softmax to your outputs. Many models, however, output the values right before the softmax is applied, which are also known as the *logits*. We need to tell the loss function that that's what our model does, and the only way to do that is to call it directly, rather than by name with a string.
96 |
97 |
98 |
99 |
100 | ### Improving training performance
101 |
102 |
103 |
104 | If you try the above code, it certainly runs, but you'll find that the loss declines only slowly or sporadically. The primary cause
105 | is the *learning rate*. As with the loss, when we pass Keras the name of an optimizer as a string, Keras initializes
106 | that optimizer with default values for all parameters, including learning rate. From long experience, though, we know
107 | that transformer models benefit from a much lower learning rate than the default for Adam, which is 1e-3, also written
108 | as 10 to the power of -3, or 0.001. 5e-5 (0.00005), which is some twenty times lower, is a much better starting point.
109 |
110 | In addition to lowering the learning rate, we have a second trick up our sleeve: We can slowly reduce the learning rate
111 | over the course of training. In the literature, you will sometimes see this referred to as *decaying* or *annealing*
112 | the learning rate. In Keras, the best way to do this is to use a *learning rate scheduler*. A good one to use is
113 | `PolynomialDecay` — despite the name, with default settings it simply linearly decays the learning rate from the initial
114 | value to the final value over the course of training, which is exactly what we want. In order to use a scheduler correctly,
115 | though, we need to tell it how long training is going to be. We compute that as `num_train_steps` below.
116 |
117 | ```py
118 | from tensorflow.keras.optimizers import Adam
119 | from tensorflow.keras.optimizers.schedules import PolynomialDecay
120 |
121 | batch_size = 8
122 | num_epochs = 3
123 | # The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
124 | # by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
125 | # not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
126 | num_train_steps = len(tf_train_dataset) * num_epochs
127 | lr_scheduler = PolynomialDecay(
128 |     initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
129 | )
130 |
131 | opt = Adam(learning_rate=lr_scheduler)
132 | ```
133 |
134 |
135 |
136 | The 🤗 Transformers library also has a `create_optimizer()` function that will create an `AdamW` optimizer with learning rate decay. This is a convenient shortcut that you'll see in detail in future sections of the course.
137 |
138 |
139 |
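For reference, here is a minimal sketch of that shortcut. The argument names are assumptions based on the `create_optimizer()` signature, so double-check the documentation before relying on them:

```py
from transformers import create_optimizer

# create_optimizer returns both the optimizer and the learning rate schedule it uses
optimizer, lr_schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
```

You could pass this `optimizer` to `compile()` exactly like the `opt` we defined above.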
140 | Now we have our all-new optimizer, and we can try training with it. First, let's reload the model to reset the changes to the weights from the training run we just did, and then we can compile it with the new optimizer:
141 |
142 | ```py
143 | import tensorflow as tf
144 |
145 | model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
146 | loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
147 | model.compile(optimizer=opt, loss=loss, metrics=["accuracy"])
148 | ```
149 |
150 | Now, we fit again:
151 |
152 | ```py
153 | model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3)
154 | ```
155 |
156 |
157 |
158 | 💡 If you want to automatically upload your model to the Hub during training, you can pass a `PushToHubCallback` to the `model.fit()` method. We will learn more about this in [Chapter 4](/course/chapter4/3).
159 |
160 |
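As a quick preview, a minimal sketch could look like the following. The repository name is just a placeholder, and the exact arguments of `PushToHubCallback` may vary between versions, so treat this as illustrative:

```py
from transformers.keras_callbacks import PushToHubCallback

# "bert-finetuned-mrpc" is a placeholder repository name; pick your own
callback = PushToHubCallback(output_dir="bert-finetuned-mrpc", tokenizer=tokenizer)

model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    callbacks=[callback],
)
```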
161 |
162 | ### Model predictions
163 |
164 |
165 |
166 |
167 | Training and watching the loss go down is all very nice, but what if we want to actually get outputs from the trained model, either to compute some metrics, or to use the model in production? To do that, we can just use the `predict()` method. This will return the *logits* from the output head of the model, one per class.
168 |
169 | ```py
170 | preds = model.predict(tf_validation_dataset)["logits"]
171 | ```
172 |
173 | We can convert these logits into the model's class predictions by using `argmax` to find the highest logit, which corresponds to the most likely class:
174 |
175 | ```py
176 | class_preds = np.argmax(preds, axis=1)
177 | print(preds.shape, class_preds.shape)
178 | ```
179 |
180 | ```python out
181 | (408, 2) (408,)
182 | ```
183 |
184 | Now, let's use those `preds` to compute some metrics! We can load the metrics associated with the MRPC dataset as easily as we loaded the dataset, this time with the `load_metric()` function. The object returned has a `compute()` method we can use to do the metric calculation:
185 |
186 | ```py
187 | from datasets import load_metric
188 |
189 | metric = load_metric("glue", "mrpc")
190 | metric.compute(predictions=class_preds, references=raw_datasets["validation"]["label"])
191 | ```
192 |
193 | ```python out
194 | {'accuracy': 0.8578431372549019, 'f1': 0.8996539792387542}
195 | ```
196 |
197 | The exact results you get may vary, as the random initialization of the model head can change the metrics achieved. Here, our model has an accuracy of 85.78% on the validation set and an F1 score of 89.97. Those are the two metrics used to evaluate results on the MRPC dataset of the GLUE benchmark. The table in the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf) reported an F1 score of 88.9 for the base model, so our fine-tuned `bert-base-uncased` checkpoint is doing slightly better here; small differences like this are expected from run-to-run variation in fine-tuning.
198 |
199 | This concludes the introduction to fine-tuning using the Keras API. An example of doing this for the most common NLP tasks will be given in Chapter 7. If you would like to hone your skills on the Keras API, try to fine-tune a model on the GLUE SST-2 dataset, using the data processing you did in section 2.
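As a hint to get you started, only the first preprocessing steps change. Note that SST-2 contains single sentences rather than sentence pairs, so the tokenize function below is an adaptation you will need to carry through the rest of the pipeline:

```py
from datasets import load_dataset

raw_datasets = load_dataset("glue", "sst2")


# SST-2 has a single "sentence" column instead of a sentence pair
def tokenize_function(example):
    return tokenizer(example["sentence"], truncation=True)
```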
200 |
--------------------------------------------------------------------------------
/chapters/en/chapter3/6.mdx:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # End-of-chapter quiz
6 |
7 | Test what you learned in this chapter!
8 |
9 | ### 1. The `emotion` dataset contains Twitter messages labeled with emotions. Search for it in the [Hub](https://huggingface.co/datasets), and read the dataset card. Which of these is not one of its basic emotions?
10 |
11 |
32 |
33 | ### 2. Search for the `ar_sarcasm` dataset in the [Hub](https://huggingface.co/datasets). Which task does it support?
34 |
35 | dataset card!"
45 | },
46 | {
47 | text: "Named entity recognition",
48 | explain: "That's not it — take another look at the dataset card!"
49 | },
50 | {
51 | text: "Question answering",
52 | explain: "Alas, this question was not answered correctly. Try again!"
53 | }
54 | ]}
55 | />
56 |
57 | ### 3. How does the BERT model expect a pair of sentences to be processed?
58 |
59 | [SEP] special token is needed to separate the two sentences, but that's not the only thing!"
64 | },
65 | {
66 | text: "[CLS] Tokens_of_sentence_1 Tokens_of_sentence_2",
67 | explain: "A [CLS] special token is required at the beginning, but that's not the only thing!"
68 | },
69 | {
70 | text: "[CLS] Tokens_of_sentence_1 [SEP] Tokens_of_sentence_2 [SEP]",
71 | explain: "That's correct!",
72 | correct: true
73 | },
74 | {
75 | text: "[CLS] Tokens_of_sentence_1 [SEP] Tokens_of_sentence_2",
76 | explain: "A [CLS] special token is needed at the beginning as well as a [SEP] special token to separate the two sentences, but that's not all!"
77 | }
78 | ]}
79 | />
80 |
81 | {#if fw === 'pt'}
82 | ### 4. What are the benefits of the `Dataset.map()` method?
83 |
84 |
103 |
104 | ### 5. What does dynamic padding mean?
105 |
106 |
123 |
124 | ### 6. What is the purpose of a collate function?
125 |
126 | DataCollatorWithPadding specifically."
131 | },
132 | {
133 | text: "It puts together all the samples in a batch.",
134 | explain: "Correct! You can pass the collate function as an argument of a DataLoader. We used the DataCollatorWithPadding function, which pads all items in a batch so they have the same length.",
135 | correct: true
136 | },
137 | {
138 | text: "It preprocesses the whole dataset.",
139 | explain: "That would be a preprocessing function, not a collate function."
140 | },
141 | {
142 | text: "It truncates the sequences in the dataset.",
143 | explain: "A collate function is involved in handling individual batches, not the whole dataset. If you're interested in truncating, you can use the truncate argument of tokenizer."
144 | }
145 | ]}
146 | />
147 |
148 | ### 7. What happens when you instantiate one of the `AutoModelForXxx` classes with a pretrained language model (such as `bert-base-uncased`) that corresponds to a different task than the one for which it was trained?
149 |
150 | AutoModelForSequenceClassification with bert-base-uncased, we got warnings when instantiating the model. The pretrained head is not used for the sequence classification task, so it's discarded and a new head is instantiated with random weights.",
159 | correct: true
160 | },
161 | {
162 | text: "The head of the pretrained model is discarded.",
163 | explain: "Something else needs to happen. Try again!"
164 | },
165 | {
166 | text: "Nothing, since the model can still be fine-tuned for the different task.",
167 | explain: "The head of the pretrained model was not trained to solve this task, so we should discard the head!"
168 | }
169 | ]}
170 | />
171 |
172 | ### 8. What's the purpose of `TrainingArguments`?
173 |
174 | Trainer.",
178 | explain: "Correct!",
179 | correct: true
180 | },
181 | {
182 | text: "It specifies the size of the model.",
183 | explain: "The model size is defined by the model configuration, not the class TrainingArguments."
184 | },
185 | {
186 | text: "It just contains the hyperparameters used for evaluation.",
187 | explain: "In the example, we specified where the model and its checkpoints will be saved. Try again!"
188 | },
189 | {
190 | text: "It just contains the hyperparameters used for training.",
191 | explain: "In the example, we used an evaluation_strategy as well, so this impacts evaluation. Try again!"
192 | }
193 | ]}
194 | />
195 |
196 | ### 9. Why should you use the 🤗 Accelerate library?
197 |
198 | Trainer, not the 🤗 Accelerate library. Try again!"
207 | },
208 | {
209 | text: "It makes our training loops work on distributed strategies",
210 | explain: "Correct! With 🤗 Accelerate, your training loops will work for multiple GPUs and TPUs.",
211 | correct: true
212 | },
213 | {
214 | text: "It provides more optimization functions.",
215 | explain: "No, the 🤗 Accelerate library does not provide any optimization functions."
216 | }
217 | ]}
218 | />
219 |
220 | {:else}
221 | ### 4. What happens when you instantiate one of the `TFAutoModelForXxx` classes with a pretrained language model (such as `bert-base-uncased`) that corresponds to a different task than the one for which it was trained?
222 |
223 | TFAutoModelForSequenceClassification with bert-base-uncased, we got warnings when instantiating the model. The pretrained head is not used for the sequence classification task, so it's discarded and a new head is instantiated with random weights.",
232 | correct: true
233 | },
234 | {
235 | text: "The head of the pretrained model is discarded.",
236 | explain: "Something else needs to happen. Try again!"
237 | },
238 | {
239 | text: "Nothing, since the model can still be fine-tuned for the different task.",
240 | explain: "The head of the pretrained model was not trained to solve this task, so we should discard the head!"
241 | }
242 | ]}
243 | />
244 |
245 | ### 5. The TensorFlow models from `transformers` are already Keras models. What benefit does this offer?
246 |
247 | TPUStrategy scope, including the initialization of the model."
252 | },
253 | {
254 | text: "You can leverage existing methods such as compile(), fit(), and predict().",
255 | explain: "Correct! Once you have the data, training on it requires very little work.",
256 | correct: true
257 | },
258 | {
259 | text: "You get to learn Keras as well as transformers.",
260 | explain: "Correct, but we're looking for something else :)",
261 | correct: true
262 | },
263 | {
264 | text: "You can easily compute metrics related to the dataset.",
265 | explain: "Keras helps us with training and evaluating the model, not computing dataset-related metrics."
266 | }
267 | ]}
268 | />
269 |
270 | ### 6. How can you define your own custom metric?
271 |
272 | tf.keras.metrics.Metric.",
276 | explain: "Great!",
277 | correct: true
278 | },
279 | {
280 | text: "Using the Keras functional API.",
281 | explain: "Try again!"
282 | },
283 | {
284 | text: "By using a callable with signature metric_fn(y_true, y_pred).",
285 | explain: "Correct!",
286 | correct: true
287 | },
288 | {
289 | text: "By Googling it.",
290 | explain: "That's not the answer we're looking for, but it should help you find it.",
291 | correct: true
292 | }
293 | ]}
294 | />
295 |
296 | {/if}
--------------------------------------------------------------------------------