├── requirements.txt ├── chapters ├── es │ ├── _toctree.yml │ └── chapter0 │ │ └── section1.mdx └── en │ ├── chapter6 │ ├── 9.mdx │ ├── 1.mdx │ └── 4.mdx │ ├── chapter8 │ ├── 6.mdx │ ├── 1.mdx │ ├── 5.mdx │ └── 7.mdx │ ├── chapter4 │ ├── 5.mdx │ ├── 1.mdx │ ├── 2.mdx │ ├── 4.mdx │ └── 6.mdx │ ├── chapter2 │ ├── 7.mdx │ ├── 1.mdx │ ├── 6.mdx │ └── 3.mdx │ ├── chapter1 │ ├── 6.mdx │ ├── 5.mdx │ ├── 7.mdx │ ├── 9.mdx │ ├── 2.mdx │ ├── 8.mdx │ ├── 1.mdx │ └── 10.mdx │ ├── chapter3 │ ├── 5.mdx │ ├── 1.mdx │ ├── 3.mdx │ ├── 3_tf.mdx │ └── 6.mdx │ ├── chapter5 │ ├── 7.mdx │ ├── 1.mdx │ ├── 2.mdx │ └── 8.mdx │ ├── chapter7 │ ├── 8.mdx │ └── 1.mdx │ ├── _toctree.yml │ ├── chapter0 │ └── 1.mdx │ └── event │ └── 1.mdx ├── Makefile ├── .github ├── workflows │ ├── delete_doc_comment.yml │ ├── quality.yml │ ├── build_documentation.yml │ └── build_pr_documentation.yml └── ISSUE_TEMPLATE │ └── translations.md ├── upcoming_chapters └── en │ ├── chapter11.md │ ├── chapter12.md │ ├── chapter10.md │ └── chapter9.md ├── utils ├── carbon-config.json ├── code_formatter.py └── generate_notebooks.py ├── .gitignore └── README.md /requirements.txt: -------------------------------------------------------------------------------- 1 | nbformat>=5.1.3 2 | PyYAML>=5.4.1 3 | black -------------------------------------------------------------------------------- /chapters/es/_toctree.yml: -------------------------------------------------------------------------------- 1 | - title: Setup 2 | sections: 3 | - Creación de un entorno de trabajo -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: quality style 2 | 3 | # Check code formatting 4 | quality: 5 | python utils/code_formatter.py --check_only 6 | 7 | # Format code samples automatically and check is there are any problems left that need manual fixing 8 | style: 9 | python utils/code_formatter.py 10 | -------------------------------------------------------------------------------- /.github/workflows/delete_doc_comment.yml: -------------------------------------------------------------------------------- 1 | name: Delete dev documentation 2 | 3 | on: 4 | pull_request: 5 | types: [ closed ] 6 | 7 | 8 | jobs: 9 | delete: 10 | uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@main 11 | with: 12 | pr_number: ${{ github.event.number }} 13 | package: course -------------------------------------------------------------------------------- /upcoming_chapters/en/chapter11.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Chapter 11: A custom training loop' 3 | description: 4 | 'But what about my own specific problems?' 
5 | prev: /chapter10 6 | next: /chapter12 7 | type: chapter 8 | id: 11 9 | --- 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /.github/workflows/quality.yml: -------------------------------------------------------------------------------- 1 | name: Quality Check 2 | 3 | on: 4 | pull_request: 5 | 6 | jobs: 7 | quality: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v2 11 | - name: Set up Python 3.6 12 | uses: actions/setup-python@v2 13 | with: 14 | python-version: 3.6 15 | - name: Install Python dependencies 16 | run: pip install black 17 | - name: Run Quality check 18 | run: make quality -------------------------------------------------------------------------------- /.github/workflows/build_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Build documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - release 7 | - doc-builder* 8 | 9 | jobs: 10 | build: 11 | uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main 12 | with: 13 | commit_sha: ${{ github.sha }} 14 | package: course 15 | path_to_docs: course/chapters/en 16 | additional_args: --not_python_module 17 | secrets: 18 | token: ${{ secrets.HUGGINGFACE_PUSH }} -------------------------------------------------------------------------------- /chapters/en/chapter6/9.mdx: -------------------------------------------------------------------------------- 1 | # Tokenizers, check! 2 | 3 | Great job finishing this chapter! 4 | 5 | After this deep dive into tokenizers, you should: 6 | 7 | - Be able to train a new tokenizer using an old one as a template 8 | - Understand how to use offsets to map tokens' positions to their original span of text 9 | - Know the differences between BPE, WordPiece, and Unigram 10 | - Be able to mix and match the blocks provided by the 🤗 Tokenizers library to build your own tokenizer 11 | - Be able to use that tokenizer inside the 🤗 Transformers library 12 | -------------------------------------------------------------------------------- /upcoming_chapters/en/chapter12.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Chapter 12: Contribute to Transformers' 3 | description: 4 | 'Giving back' 5 | prev: /chapter11 6 | next: null 7 | type: chapter 8 | id: 11 9 | --- 10 | 11 | 12 | 13 | 14 | loprtin rte miondjfnjfs 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /chapters/en/chapter8/6.mdx: -------------------------------------------------------------------------------- 1 | # Part 2 completed! 2 | 3 | Congratulations, you've made it through the second part of the course! We're actively working on the third one, so subscribe to our [newsletter](https://huggingface.curated.co/) to make sure you don't miss its release. 4 | 5 | You should now be able to tackle a range of NLP tasks, and fine-tune or pretrain a model on them. Don't forget to share your results with the community on the [Model Hub](https://huggingface.co/models). 6 | 7 | We can't wait to see what you will build with the knowledge that you've gained! 8 | -------------------------------------------------------------------------------- /upcoming_chapters/en/chapter10.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Chapter 10: Speeding up training' 3 | description: 4 | 'We need to go faster.' 
5 | prev: /chapter9 6 | next: /chapter11 7 | type: chapter 8 | id: 10 9 | --- 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /upcoming_chapters/en/chapter9.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Chapter 09: Specialized architectures' 3 | description: 4 | 'Become an expert at transformer models.' 5 | prev: /chapter8 6 | next: /chapter10 7 | type: chapter 8 | id: 9 9 | --- 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /.github/workflows/build_pr_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Build PR Documentation 2 | 3 | on: 4 | pull_request: 5 | 6 | concurrency: 7 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} 8 | cancel-in-progress: true 9 | 10 | jobs: 11 | build: 12 | uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main 13 | with: 14 | commit_sha: ${{ github.event.pull_request.head.sha }} 15 | pr_number: ${{ github.event.number }} 16 | package: course 17 | path_to_docs: course/chapters/en 18 | additional_args: --not_python_module -------------------------------------------------------------------------------- /utils/carbon-config.json: -------------------------------------------------------------------------------- 1 | {"paddingVertical":"1px","paddingHorizontal":"1px","backgroundImage":null,"backgroundImageSelection":null,"backgroundMode":"color","backgroundColor":"rgba(255,255,255,1)","dropShadow":false,"dropShadowOffsetY":"20px","dropShadowBlurRadius":"68px","theme":"one-light","windowTheme":"none","language":"python","fontFamily":"Fira Code","fontSize":"14px","lineHeight":"152%","windowControls":false,"widthAdjustment":true,"lineNumbers":false,"firstLineNumber":1,"exportSize":"2x","watermark":false,"squaredImage":false,"hiddenCharacters":false,"name":"","width":680,"highlights":{"keyword":"rgba(139,92,246,1)","variable":"rgba(236,72,153,1)","number":"rgba(180,83,9,1)","string":"rgba(80,161,79,1)"}} -------------------------------------------------------------------------------- /chapters/en/chapter4/5.mdx: -------------------------------------------------------------------------------- 1 | # Part 1 completed! 2 | 3 | This is the end of the first part of the course! Part 2 will be released on November 15th with a big community event, see more information [here](https://huggingface.co/blog/course-launch-event). 4 | 5 | You should now be able to fine-tune a pretrained model on a text classification problem (single or pairs of sentences) and upload the result to the Model Hub. To make sure you mastered this first section, you should do exactly that on a problem that interests you (and not necessarily in English if you speak another language)! You can find help in the [Hugging Face forums](https://discuss.huggingface.co/) and share your project in [this topic](https://discuss.huggingface.co/t/share-your-projects/6803) once you're finished. 6 | 7 | We can't wait to see what you will build with this! 8 | -------------------------------------------------------------------------------- /chapters/en/chapter2/7.mdx: -------------------------------------------------------------------------------- 1 | # Basic usage completed! 2 | 3 | Great job following the course up to here! 
To recap, in this chapter you: 4 | 5 | - Learned the basic building blocks of a Transformer model. 6 | - Learned what makes up a tokenization pipeline. 7 | - Saw how to use a Transformer model in practice. 8 | - Learned how to leverage a tokenizer to convert text to tensors that are understandable by the model. 9 | - Set up a tokenizer and a model together to get from text to predictions. 10 | - Learned the limitations of input IDs, and learned about attention masks. 11 | - Played around with versatile and configurable tokenizer methods. 12 | 13 | From now on, you should be able to freely navigate the 🤗 Transformers docs: the vocabulary will sound familiar, and you've already seen the methods that you'll use the majority of the time. 14 | -------------------------------------------------------------------------------- /chapters/en/chapter1/6.mdx: -------------------------------------------------------------------------------- 1 | # Decoder models 2 | 3 | 4 | 5 | Decoder models use only the decoder of a Transformer model. At each stage, for a given word the attention layers can only access the words positioned before it in the sentence. These models are often called *auto-regressive models*. 6 | 7 | The pretraining of decoder models usually revolves around predicting the next word in the sentence. 8 | 9 | These models are best suited for tasks involving text generation. 10 | 11 | Representatives of this family of models include: 12 | 13 | - [CTRL](https://huggingface.co/transformers/model_doc/ctrl.html) 14 | - [GPT](https://huggingface.co/transformers/model_doc/gpt.html) 15 | - [GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html) 16 | - [Transformer XL](https://huggingface.co/transformers/model_doc/transformerxl.html) 17 | -------------------------------------------------------------------------------- /chapters/en/chapter3/5.mdx: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Fine-tuning, Check! 4 | 5 | That was fun! In the first two chapters you learned about models and tokenizers, and now you know how to fine-tune them for your own data. To recap, in this chapter you: 6 | 7 | {#if fw === 'pt'} 8 | * Learned about datasets in the [Hub](https://huggingface.co/datasets) 9 | * Learned how to load and preprocess datasets, including using dynamic padding and collators 10 | * Implemented your own fine-tuning and evaluation of a model 11 | * Implemented a lower-level training loop 12 | * Used 🤗 Accelerate to easily adapt your training loop so it works for multiple GPUs or TPUs 13 | 14 | {:else} 15 | * Learned about datasets in the [Hub](https://huggingface.co/datasets) 16 | * Learned how to load and preprocess datasets 17 | * Learned how to fine-tune and evaluate a model with Keras 18 | * Implemented a custom metric 19 | 20 | {/if} 21 | -------------------------------------------------------------------------------- /chapters/en/chapter5/7.mdx: -------------------------------------------------------------------------------- 1 | # 🤗 Datasets, check! 2 | 3 | Well, that was quite a tour through the 🤗 Datasets library -- congratulations on making it this far! With the knowledge that you've gained from this chapter, you should be able to: 4 | 5 | - Load datasets from anywhere, be it the Hugging Face Hub, your laptop, or a remote server at your company. 6 | - Wrangle your data using a mix of the `Dataset.map()` and `Dataset.filter()` functions. 7 | - Quickly switch between data formats like Pandas and NumPy using `Dataset.set_format()`. 
8 | - Create your very own dataset and push it to the Hugging Face Hub. 9 | - Embed your documents using a Transformer model and build a semantic search engine using FAISS. 10 | 11 | In [Chapter 7](/course/chapter7), we'll put all of this to good use as we take a deep dive into the core NLP tasks that Transformer models are great for. Before jumping ahead, though, put your knowledge of 🤗 Datasets to the test with a quick quiz! -------------------------------------------------------------------------------- /chapters/en/chapter3/1.mdx: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Introduction 4 | 5 | In [Chapter 2](/course/chapter2) we explored how to use tokenizers and pretrained models to make predictions. But what if you want to fine-tune a pretrained model for your own dataset? That's the topic of this chapter! You will learn: 6 | 7 | {#if fw === 'pt'} 8 | * How to prepare a large dataset from the Hub 9 | * How to use the high-level `Trainer` API to fine-tune a model 10 | * How to use a custom training loop 11 | * How to leverage the 🤗 Accelerate library to easily run that custom training loop on any distributed setup 12 | 13 | {:else} 14 | * How to prepare a large dataset from the Hub 15 | * How to use Keras to fine-tune a model 16 | * How to use Keras to get predictions 17 | * How to use a custom metric 18 | 19 | {/if} 20 | 21 | In order to upload your trained checkpoints to the Hugging Face Hub, you will need a huggingface.co account: [create an account](https://huggingface.co/join) -------------------------------------------------------------------------------- /chapters/en/chapter8/1.mdx: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | Now that you know how to tackle the most common NLP tasks with 🤗 Transformers, you should be able to get started on your own projects! In this chapter we will explore what to do when you hit a problem. You'll learn how to successfully debug your code or your training, and how to ask the community for help if you don't manage to solve the problem by yourself. And if you think you've found a bug in one of the Hugging Face libraries, we'll show you the best way to report it so that the issue is resolved as quickly as possible. 4 | 5 | More precisely, in this chapter you will learn: 6 | 7 | - The first thing to do when you get an error 8 | - How to ask for help on the [forums](https://discuss.huggingface.co/) 9 | - How to debug your training pipeline 10 | - How to write a good issue 11 | 12 | None of this is specifically related to 🤗 Transformers or the Hugging Face ecosystem, of course; the lessons from this chapter are applicable to most open source projects! 13 | -------------------------------------------------------------------------------- /chapters/en/chapter5/1.mdx: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | In [Chapter 3](/course/chapter3) you got your first taste of the 🤗 Datasets library and saw that there were three main steps when it came to fine-tuning a model: 4 | 5 | 1. Load a dataset from the Hugging Face Hub. 6 | 2. Preprocess the data with `Dataset.map()`. 7 | 3. Load and compute metrics. 8 | 9 | But this is just scratching the surface of what 🤗 Datasets can do! In this chapter, we will take a deep dive into the library. Along the way, we'll find answers to the following questions: 10 | 11 | * What do you do when your dataset is not on the Hub? 
12 | * How can you slice and dice a dataset? (And what if you _really_ need to use Pandas?) 13 | * What do you do when your dataset is huge and will melt your laptop's RAM? 14 | * What the heck are "memory mapping" and Apache Arrow? 15 | * How can you create your own dataset and push it to the Hub? 16 | 17 | The techniques you learn here will prepare you for the advanced tokenization and fine-tuning tasks in [Chapter 6](/course/chapter6) and [Chapter 7](/course/chapter7) -- so grab a coffee and let's get started! -------------------------------------------------------------------------------- /chapters/en/chapter1/5.mdx: -------------------------------------------------------------------------------- 1 | # Encoder models 2 | 3 | 4 | 5 | Encoder models use only the encoder of a Transformer model. At each stage, the attention layers can access all the words in the initial sentence. These models are often characterized as having "bi-directional" attention, and are often called *auto-encoding models*. 6 | 7 | The pretraining of these models usually revolves around somehow corrupting a given sentence (for instance, by masking random words in it) and tasking the model with finding or reconstructing the initial sentence. 8 | 9 | Encoder models are best suited for tasks requiring an understanding of the full sentence, such as sentence classification, named entity recognition (and more generally word classification), and extractive question answering. 10 | 11 | Representatives of this family of models include: 12 | 13 | - [ALBERT](https://huggingface.co/transformers/model_doc/albert.html) 14 | - [BERT](https://huggingface.co/transformers/model_doc/bert.html) 15 | - [DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html) 16 | - [ELECTRA](https://huggingface.co/transformers/model_doc/electra.html) 17 | - [RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html) 18 | -------------------------------------------------------------------------------- /chapters/en/chapter1/7.mdx: -------------------------------------------------------------------------------- 1 | # Sequence-to-sequence models 2 | 3 | 4 | 5 | Encoder-decoder models (also called *sequence-to-sequence models*) use both parts of the Transformer architecture. At each stage, the attention layers of the encoder can access all the words in the initial sentence, whereas the attention layers of the decoder can only access the words positioned before a given word in the input. 6 | 7 | The pretraining of these models can be done using the objectives of encoder or decoder models, but usually involves something a bit more complex. For instance, [T5](https://huggingface.co/t5-base) is pretrained by replacing random spans of text (that can contain several words) with a single mask special word, and the objective is then to predict the text that this mask word replaces. 8 | 9 | Sequence-to-sequence models are best suited for tasks revolving around generating new sentences depending on a given input, such as summarization, translation, or generative question answering. 
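As a small illustration, these models can be tried out with the same `pipeline()` function you saw earlier in this chapter. The sketch below loads a summarization pipeline; which checkpoint it downloads by default, and therefore the exact output, may vary over time:

```python
from transformers import pipeline

# Load a sequence-to-sequence model through the summarization pipeline.
# The default checkpoint is chosen by the library and may change over time.
summarizer = pipeline("summarization")
summarizer(
    "America has changed dramatically during recent years. Not only has the number of "
    "graduates in traditional engineering disciplines such as mechanical, civil, "
    "electrical, chemical, and aeronautical engineering declined, but in most of "
    "the premier American universities engineering curricula now concentrate on "
    "and encourage largely the study of engineering science."
)
```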
10 | 11 | Representatives of this family of models include: 12 | 13 | - [BART](https://huggingface.co/transformers/model_doc/bart.html) 14 | - [mBART](https://huggingface.co/transformers/model_doc/mbart.html) 15 | - [Marian](https://huggingface.co/transformers/model_doc/marian.html) 16 | - [T5](https://huggingface.co/transformers/model_doc/t5.html) 17 | -------------------------------------------------------------------------------- /chapters/en/chapter1/9.mdx: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | In this chapter, you saw how to approach different NLP tasks using the high-level `pipeline()` function from 🤗 Transformers. You also saw how to search for and use models in the Hub, as well as how to use the Inference API to test the models directly in your browser. 4 | 5 | We discussed how Transformer models work at a high level, and talked about the importance of transfer learning and fine-tuning. A key aspect is that you can use the full architecture or only the encoder or decoder, depending on what kind of task you aim to solve. The following table summarizes this: 6 | 7 | | Model | Examples | Tasks | 8 | |-----------------|--------------------------------------------|----------------------------------------------------------------------------------| 9 | | Encoder | ALBERT, BERT, DistilBERT, ELECTRA, RoBERTa | Sentence classification, named entity recognition, extractive question answering | 10 | | Decoder | CTRL, GPT, GPT-2, Transformer XL | Text generation | 11 | | Encoder-decoder | BART, T5, Marian, mBART | Summarization, translation, generative question answering | 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | 3 | # Logs 4 | logs 5 | *.log 6 | npm-debug.log* 7 | yarn-debug.log* 8 | yarn-error.log* 9 | 10 | # Runtime data 11 | pids 12 | *.pid 13 | *.seed 14 | *.pid.lock 15 | 16 | # Directory for instrumented libs generated by jscoverage/JSCover 17 | lib-cov 18 | 19 | # Coverage directory used by tools like istanbul 20 | coverage 21 | 22 | # nyc test coverage 23 | .nyc_output 24 | 25 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 26 | .grunt 27 | 28 | # Bower dependency directory (https://bower.io/) 29 | bower_components 30 | 31 | # node-waf configuration 32 | .lock-wscript 33 | 34 | # Compiled binary addons (http://nodejs.org/api/addons.html) 35 | build/Release 36 | 37 | # Dependency directories 38 | node_modules/ 39 | jspm_packages/ 40 | 41 | # Typescript v1 declaration files 42 | typings/ 43 | 44 | # Optional npm cache directory 45 | .npm 46 | 47 | # Optional eslint cache 48 | .eslintcache 49 | 50 | # Optional REPL history 51 | .node_repl_history 52 | 53 | # Output of 'npm pack' 54 | *.tgz 55 | 56 | # dotenv environment variables file 57 | .env 58 | 59 | # gatsby files 60 | .cache/ 61 | public 62 | 63 | # Mac files 64 | .DS_Store 65 | 66 | # Yarn 67 | yarn-error.log 68 | yarn.lock 69 | .pnp/ 70 | .pnp.js 71 | # Yarn Integrity file 72 | .yarn-integrity 73 | 74 | # Sylvain notes folder 75 | notes 76 | 77 | # Ignore Colab notebooks 78 | nbs/ 79 | 80 | # Byte-compiled 81 | __pycache__/ 82 | .cache/ -------------------------------------------------------------------------------- /chapters/en/chapter7/8.mdx: -------------------------------------------------------------------------------- 1 | # Mastering NLP 2 | 3 | If you've made it this 
far in the course, congratulations -- you now have all the knowledge and tools you need to tackle (almost) any NLP task with 🤗 Transformers and the Hugging Face ecosystem! 4 | 5 | We have seen a lot of different data collators, so we made this little video to help you find which one to use for each task: 6 | 7 | 8 | 9 | After completing this lightning tour through the core NLP tasks, you should: 10 | 11 | * Know which architectures (encoder, decoder, or encoder-decoder) are best suited for each task 12 | * Understand the difference between pretraining and fine-tuning a language model 13 | * Know how to train Transformer models using either the `Trainer` API and distributed training features of 🤗 Accelerate or TensorFlow and Keras, depending on which track you've been following 14 | * Understand the meaning and limitations of metrics like ROUGE and BLEU for text generation tasks 15 | * Know how to interact with your fine-tuned models, both on the Hub and using the `pipeline` from 🤗 Transformers 16 | 17 | Despite all this knowledge, there will come a time when you'll either encounter a difficult bug in your code or have a question about how to solve a particular NLP problem. Fortunately, the Hugging Face community is here to help you! In the final chapter of this part of the course, we'll explore how you can debug your Transformer models and ask for help effectively. -------------------------------------------------------------------------------- /chapters/en/chapter6/1.mdx: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | In [Chapter 3](/course/chapter3), we looked at how to fine-tune a model on a given task. When we do that, we use the same tokenizer that the model was pretrained with -- but what do we do when we want to train a model from scratch? In these cases, using a tokenizer that was pretrained on a corpus from another domain or language is typically suboptimal. For example, a tokenizer that's trained on an English corpus will perform poorly on a corpus of Japanese texts because the use of spaces and punctuation is very different in the two languages. 4 | 5 | In this chapter, you will learn how to train a brand new tokenizer on a corpus of texts, so it can then be used to pretrain a language model. This will all be done with the help of the [🤗 Tokenizers](https://github.com/huggingface/tokenizers) library, which provides the "fast" tokenizers in the [🤗 Transformers](https://github.com/huggingface/transformers) library. We'll take a close look at the features that this library provides, and explore how the fast tokenizers differ from the "slow" versions. 6 | 7 | Topics we will cover include: 8 | 9 | * How to train a new tokenizer similar to the one used by a given checkpoint on a new corpus of texts 10 | * The special features of fast tokenizers 11 | * The differences between the three main subword tokenization algorithms used in NLP today 12 | * How to build a tokenizer from scratch with the 🤗 Tokenizers library and train it on some data 13 | 14 | The techniques introduced in this chapter will prepare you for the section in [Chapter 7](/course/chapter7/6) where we look at creating a language model for Python source code. Let's start by looking at what it means to "train" a tokenizer in the first place. 
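Before we do, here is a minimal, self-contained sketch of the high-level workflow this chapter builds toward. The toy corpus and vocabulary size below are just placeholders; the next sections walk through each step properly on a real dataset:

```python
from transformers import AutoTokenizer

# A toy corpus of Python snippets; the next section streams a real dataset instead.
corpus = ["def add(a, b):", "    return a + b", "print(add(1, 2))"]
training_corpus = (corpus[i : i + 2] for i in range(0, len(corpus), 2))

# Load an existing (fast) tokenizer as a template and train a new one on the corpus.
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")
new_tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, vocab_size=1000)
new_tokenizer.save_pretrained("my-new-tokenizer")
```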
-------------------------------------------------------------------------------- /chapters/en/chapter4/1.mdx: -------------------------------------------------------------------------------- 1 | # The Hugging Face Hub 2 | 3 | The [Hugging Face Hub](https://huggingface.co/) –- our main website –- is a central platform that enables anyone to discover, use, and contribute new state-of-the-art models and datasets. It hosts a wide variety of models, with more than 10,000 publicly available. We'll focus on the models in this chapter, and take a look at the datasets in Chapter 5. 4 | 5 | The models in the Hub are not limited to 🤗 Transformers or even NLP. There are models from [Flair](https://github.com/flairNLP/flair) and [AllenNLP](https://github.com/allenai/allennlp) for NLP, [Asteroid](https://github.com/asteroid-team/asteroid) and [pyannote](https://github.com/pyannote/pyannote-audio) for speech, and [timm](https://github.com/rwightman/pytorch-image-models) for vision, to name a few. 6 | 7 | Each of these models is hosted as a Git repository, which allows versioning and reproducibility. Sharing a model on the Hub means opening it up to the community and making it accessible to anyone looking to easily use it, in turn eliminating their need to train a model on their own and simplifying sharing and usage. 8 | 9 | Additionally, sharing a model on the Hub automatically deploys a hosted Inference API for that model. Anyone in the community is free to test it out directly on the model's page, with custom inputs and appropriate widgets. 10 | 11 | The best part is that sharing and using any public model on the Hub is completely free! [Paid plans](https://huggingface.co/pricing) also exist if you wish to share models privately. 12 | 13 | The video below shows how to navigate the Hub. 14 | 15 | 16 | 17 | Having a huggingface.co account is required to follow along this part, as we'll be creating and managing repositories on the Hugging Face Hub: [create an account](https://huggingface.co/join) -------------------------------------------------------------------------------- /chapters/en/chapter7/1.mdx: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Introduction 4 | 5 | In [Chapter 3](/course/chapter3), you saw how to fine-tune a model for text classification. In this chapter, we will tackle the following common NLP tasks: 6 | 7 | - Token classification 8 | - Masked language modeling (like BERT) 9 | - Summarization 10 | - Translation 11 | - Causal language modeling pretraining (like GPT-2) 12 | - Question answering 13 | 14 | {#if fw === 'pt'} 15 | 16 | To do this, you'll need to leverage everything you learned about the `Trainer` API and the 🤗 Accelerate library in [Chapter 3](/course/chapter3), the 🤗 Datasets library in [Chapter 5](/course/chapter5), and the 🤗 Tokenizers library in [Chapter 6](/course/chapter6). We'll also upload our results to the Model Hub, like we did in [Chapter 4](/course/chapter4), so this is really the chapter where everything comes together! 17 | 18 | Each section can be read independently and will show you how to train a model with the `Trainer` API or with your own training loop, using 🤗 Accelerate. Feel free to skip either part and focus on the one that interests you the most: the `Trainer` API is great for fine-tuning or training your model without worrying about what's going on behind the scenes, while the training loop with `Accelerate` will let you customize any part you want more easily. 
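If [Chapter 3](/course/chapter3) feels like a while ago, here is a condensed refresher of the `Trainer`-based recipe that the following sections assume. The checkpoint, dataset, and default hyperparameters are illustrative placeholders rather than the ones used later in this chapter:

```python
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# Placeholder checkpoint and dataset, as used in Chapter 3.
checkpoint = "bert-base-uncased"
raw_datasets = load_dataset("glue", "mrpc")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


# Preprocess the dataset and pad dynamically at batching time.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
training_args = TrainingArguments("test-trainer")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()
```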
19 | 20 | {:else} 21 | 22 | To do this, you'll need to leverage everything you learned about training models with the Keras API in [Chapter 3](/course/chapter3), the 🤗 Datasets library in [Chapter 5](/course/chapter5), and the 🤗 Tokenizers library in [Chapter 6](/course/chapter6). We'll also upload our results to the Model Hub, like we did in [Chapter 4](/course/chapter4), so this is really the chapter where everything comes together! 23 | 24 | Each section can be read independently. 25 | 26 | {/if} 27 | 28 | 29 | 30 | 31 | If you read the sections in sequence, you will notice that they have quite a bit of code and prose in common. The repetition is intentional, to allow you to dip in (or come back later) to any task that interests you and find a complete working example. 32 | 33 | 34 | -------------------------------------------------------------------------------- /chapters/en/chapter1/2.mdx: -------------------------------------------------------------------------------- 1 | # Natural Language Processing 2 | 3 | Before jumping into Transformer models, let's do a quick overview of what natural language processing is and why we care about it. 4 | 5 | ## What is NLP? 6 | 7 | NLP is a field of linguistics and machine learning focused on understanding everything related to human language. The aim of NLP tasks is not only to understand single words individually, but to be able to understand the context of those words. 8 | 9 | The following is a list of common NLP tasks, with some examples of each: 10 | 11 | - **Classifying whole sentences**: Getting the sentiment of a review, detecting if an email is spam, determining if a sentence is grammatically correct or whether two sentences are logically related or not 12 | - **Classifying each word in a sentence**: Identifying the grammatical components of a sentence (noun, verb, adjective), or the named entities (person, location, organization) 13 | - **Generating text content**: Completing a prompt with auto-generated text, filling in the blanks in a text with masked words 14 | - **Extracting an answer from a text**: Given a question and a context, extracting the answer to the question based on the information provided in the context 15 | - **Generating a new sentence from an input text**: Translating a text into another language, summarizing a text 16 | 17 | NLP isn't limited to written text though. It also tackles complex challenges in speech recognition and computer vision, such as generating a transcript of an audio sample or a description of an image. 18 | 19 | ## Why is it challenging? 20 | 21 | Computers don't process information in the same way as humans. For example, when we read the sentence "I am hungry," we can easily understand its meaning. Similarly, given two sentences such as "I am hungry" and "I am sad," we're able to easily determine how similar they are. For machine learning (ML) models, such tasks are more difficult. The text needs to be processed in a way that enables the model to learn from it. And because language is complex, we need to think carefully about how this processing must be done. There has been a lot of research done on how to represent text, and we will look at some methods in the next chapter. 
22 | -------------------------------------------------------------------------------- /chapters/en/chapter1/8.mdx: -------------------------------------------------------------------------------- 1 | # Bias and limitations 2 | 3 | 9 | 10 | If your intent is to use a pretrained model or a fine-tuned version in production, please be aware that, while these models are powerful tools, they come with limitations. The biggest of these is that, to enable pretraining on large amounts of data, researchers often scrape all the content they can find, taking the best as well as the worst of what is available on the internet. 11 | 12 | To give a quick illustration, let's go back the example of a `fill-mask` pipeline with the BERT model: 13 | 14 | ```python 15 | from transformers import pipeline 16 | 17 | unmasker = pipeline("fill-mask", model="bert-base-uncased") 18 | result = unmasker("This man works as a [MASK].") 19 | print([r["token_str"] for r in result]) 20 | 21 | result = unmasker("This woman works as a [MASK].") 22 | print([r["token_str"] for r in result]) 23 | ``` 24 | 25 | ```python out 26 | ['lawyer', 'carpenter', 'doctor', 'waiter', 'mechanic'] 27 | ['nurse', 'waitress', 'teacher', 'maid', 'prostitute'] 28 | ``` 29 | 30 | When asked to fill in the missing word in these two sentences, the model gives only one gender-free answer (waiter/waitress). The others are work occupations usually associated with one specific gender -- and yes, prostitute ended up in the top 5 possibilities the model associates with "woman" and "work." This happens even though BERT is one of the rare Transformer models not built by scraping data from all over the internet, but rather using apparently neutral data (it's trained on the [English Wikipedia](https://huggingface.co/datasets/wikipedia) and [BookCorpus](https://huggingface.co/datasets/bookcorpus) datasets). 31 | 32 | When you use these tools, you therefore need to keep in the back of your mind that the original model you are using could very easily generate sexist, racist, or homophobic content. Fine-tuning the model on your data won't make this intrinsic bias disappear. 33 | -------------------------------------------------------------------------------- /chapters/en/chapter2/1.mdx: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | As you saw in [Chapter 1](/course/chapter1), Transformer models are usually very large. With millions to tens of *billions* of parameters, training and deploying these models is a complicated undertaking. Furthermore, with new models being released on a near-daily basis and each having its own implementation, trying them all out is no easy task. 4 | 5 | The 🤗 Transformers library was created to solve this problem. Its goal is to provide a single API through which any Transformer model can be loaded, trained, and saved. The library's main features are: 6 | 7 | - **Ease of use**: Downloading, loading, and using a state-of-the-art NLP model for inference can be done in just two lines of code. 8 | - **Flexibility**: At their core, all models are simple PyTorch `nn.Module` or TensorFlow `tf.keras.Model` classes and can be handled like any other models in their respective machine learning (ML) frameworks. 9 | - **Simplicity**: Hardly any abstractions are made across the library. The "All in one file" is a core concept: a model's forward pass is entirely defined in a single file, so that the code itself is understandable and hackable. 
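To make the first of these points concrete, here is roughly what those two lines look like in practice, reusing the sentiment-analysis example from [Chapter 1](/course/chapter1) (the default checkpoint the pipeline downloads may change over time):

```python
from transformers import pipeline

# Downloading, loading, and running a state-of-the-art model for inference:
classifier = pipeline("sentiment-analysis")
classifier("I've been waiting for a HuggingFace course my whole life.")
```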
10 | 11 | This last feature makes 🤗 Transformers quite different from other ML libraries. The models are not built on modules 12 | that are shared across files; instead, each model has its own layers. In addition to making the models more approachable and understandable, this allows you to easily experiment on one model without affecting others. 13 | 14 | This chapter will begin with an end-to-end example where we use a model and a tokenizer together to replicate the `pipeline()` function introduced in [Chapter 1](/course/chapter1). Next, we'll discuss the model API: we'll dive into the model and configuration classes, and show you how to load a model and how it processes numerical inputs to output predictions. 15 | 16 | Then we'll look at the tokenizer API, which is the other main component of the `pipeline()` function. Tokenizers take care of the first and last processing steps, handling the conversion from text to numerical inputs for the neural network, and the conversion back to text when it is needed. Finally, we'll show you how to handle sending multiple sentences through a model in a prepared batch, then wrap it all up with a closer look at the high-level `tokenizer()` function. 17 | 18 | 19 | ⚠️ In order to benefit from all features available with the Model Hub and 🤗 Transformers, we recommend creating an account. 20 | -------------------------------------------------------------------------------- /utils/code_formatter.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import black 3 | import os 4 | import re 5 | from pathlib import Path 6 | 7 | def blackify(filename, check_only=False): 8 | # Read the content of the file 9 | with open(filename, "r", encoding="utf-8") as f: 10 | content = f.read() 11 | lines = content.split("\n") 12 | 13 | # Split the content into code samples in py or python blocks. 14 | code_samples = [] 15 | line_index = 0 16 | while line_index < len(lines): 17 | line = lines[line_index] 18 | if line.strip() in ["```py", "```python"]: 19 | line_index += 1 20 | start_index = line_index 21 | while line_index < len(lines) and lines[line_index].strip() != "```": 22 | line_index += 1 23 | 24 | code = "\n".join(lines[start_index: line_index]) 25 | # Deal with ! instructions 26 | code = re.sub(r"^!", r"## !", code, flags=re.MULTILINE) 27 | 28 | code_samples.append({ 29 | "start_index": start_index, 30 | "end_index": line_index - 1, 31 | "code": code 32 | }) 33 | line_index += 1 34 | else: 35 | line_index += 1 36 | 37 | # Let's blackify the code! We put everything in one big text to go faster. 38 | delimiter = "\n\n### New cell ###\n" 39 | full_code = delimiter.join([sample["code"] for sample in code_samples]) 40 | formatted_code = full_code.replace("\t", " ") 41 | formatted_code = black.format_str(formatted_code, mode=black.FileMode({black.TargetVersion.PY37}, line_length=90)) 42 | 43 | # Black adds last new lines we don't want, so we strip individual code samples. 
44 | cells = formatted_code.split(delimiter) 45 | cells = [cell.strip() for cell in cells] 46 | formatted_code = delimiter.join(cells) 47 | 48 | if check_only: 49 | return full_code == formatted_code 50 | elif full_code == formatted_code: 51 | # Nothing to do, all is good 52 | return 53 | 54 | formatted_code = re.sub(r"^## !", r"!", formatted_code, flags=re.MULTILINE) 55 | print(f"Formatting {filename}") 56 | # Re-build the content with formatted code 57 | new_lines = [] 58 | start_index = 0 59 | for sample, code in zip(code_samples, formatted_code.split(delimiter)): 60 | new_lines.extend(lines[start_index:sample["start_index"]]) 61 | new_lines.append(code) 62 | start_index = sample["end_index"] + 1 63 | new_lines.extend(lines[start_index:]) 64 | 65 | 66 | with open(filename, "w", encoding="utf-8") as f: 67 | f.write("\n".join(new_lines)) 68 | 69 | 70 | def format_all_files(check_only=False): 71 | failures = [] 72 | for filename in Path("chapters").glob("**/*.mdx"): 73 | try: 74 | same = blackify(filename, check_only=check_only) 75 | if check_only and not same: 76 | failures.append(filename) 77 | except Exception: 78 | print(f"Failed to format {filename}.") 79 | raise 80 | 81 | if check_only and len(failures) > 0: 82 | raise ValueError(f"{len(failures)} files need to be formatted, run `make style`.") 83 | 84 | 85 | if __name__ == "__main__": 86 | parser = argparse.ArgumentParser() 87 | parser.add_argument("--check_only", action="store_true", help="Just check files are properly formatted.") 88 | args = parser.parse_args() 89 | 90 | format_all_files(check_only=args.check_only) 91 | -------------------------------------------------------------------------------- /chapters/en/_toctree.yml: -------------------------------------------------------------------------------- 1 | - title: 0. Setup 2 | sections: 3 | - local: chapter0/1 4 | title: Introduction 5 | 6 | - title: 1. Transformer models 7 | sections: 8 | - local: chapter1/1 9 | title: Introduction 10 | - local: chapter1/2 11 | title: Natural Language Processing 12 | - local: chapter1/3 13 | title: Transformers, what can they do? 14 | - local: chapter1/4 15 | title: How do Transformers work? 16 | - local: chapter1/5 17 | title: Encoder models 18 | - local: chapter1/6 19 | title: Decoder models 20 | - local: chapter1/7 21 | title: Sequence-to-sequence models 22 | - local: chapter1/8 23 | title: Bias and limitations 24 | - local: chapter1/9 25 | title: Summary 26 | - local: chapter1/10 27 | title: End-of-chapter quiz 28 | quiz: 1 29 | 30 | - title: 2. Using 🤗 Transformers 31 | sections: 32 | - local: chapter2/1 33 | title: Introduction 34 | - local: chapter2/2 35 | title: Behind the pipeline 36 | - local: chapter2/3 37 | title: Models 38 | - local: chapter2/4 39 | title: Tokenizers 40 | - local: chapter2/5 41 | title: Handling multiple sequences 42 | - local: chapter2/6 43 | title: Putting it all together 44 | - local: chapter2/7 45 | title: Basic usage completed! 46 | - local: chapter2/8 47 | title: End-of-chapter quiz 48 | quiz: 2 49 | 50 | - title: 3. Fine-tuning a pretrained model 51 | sections: 52 | - local: chapter3/1 53 | title: Introduction 54 | - local: chapter3/2 55 | title: Processing the data 56 | - local: chapter3/3 57 | title: Fine-tuning a model with the Trainer API or Keras 58 | local_fw: { pt: chapter3/3, tf: chapter3/3_tf } 59 | - local: chapter3/4 60 | title: A full training 61 | - local: chapter3/5 62 | title: Fine-tuning, Check! 63 | - local: chapter3/6 64 | title: End-of-chapter quiz 65 | quiz: 3 66 | 67 | - title: 4. 
Sharing models and tokenizers 68 | sections: 69 | - local: chapter4/1 70 | title: The Hugging Face Hub 71 | - local: chapter4/2 72 | title: Using pretrained models 73 | - local: chapter4/3 74 | title: Sharing pretrained models 75 | - local: chapter4/4 76 | title: Building a model card 77 | - local: chapter4/5 78 | title: Part 1 completed! 79 | - local: chapter4/6 80 | title: End-of-chapter quiz 81 | quiz: 4 82 | 83 | - title: 5. The 🤗 Datasets library 84 | sections: 85 | - local: chapter5/1 86 | title: Introduction 87 | - local: chapter5/2 88 | title: What if my dataset isn't on the Hub? 89 | - local: chapter5/3 90 | title: Time to slice and dice 91 | - local: chapter5/4 92 | title: Big data? 🤗 Datasets to the rescue! 93 | - local: chapter5/5 94 | title: Creating your own dataset 95 | - local: chapter5/6 96 | title: Semantic search with FAISS 97 | - local: chapter5/7 98 | title: 🤗 Datasets, check! 99 | - local: chapter5/8 100 | title: End-of-chapter quiz 101 | quiz: 5 102 | 103 | - title: 6. The 🤗 Tokenizers library 104 | sections: 105 | - local: chapter6/1 106 | title: Introduction 107 | - local: chapter6/2 108 | title: Training a new tokenizer from an old one 109 | - local: chapter6/3 110 | title: Fast tokenizers' special powers 111 | - local: chapter6/3b 112 | title: Fast tokenizers in the QA pipeline 113 | - local: chapter6/4 114 | title: Normalization and pre-tokenization 115 | - local: chapter6/5 116 | title: Byte-Pair Encoding tokenization 117 | - local: chapter6/6 118 | title: WordPiece tokenization 119 | - local: chapter6/7 120 | title: Unigram tokenization 121 | - local: chapter6/8 122 | title: Building a tokenizer, block by block 123 | - local: chapter6/9 124 | title: Tokenizers, check! 125 | - local: chapter6/10 126 | title: End-of-chapter quiz 127 | quiz: 6 128 | 129 | - title: 7. Main NLP tasks 130 | sections: 131 | - local: chapter7/1 132 | title: Introduction 133 | - local: chapter7/2 134 | title: Token classification 135 | - local: chapter7/3 136 | title: Fine-tuning a masked language model 137 | - local: chapter7/4 138 | title: Translation 139 | - local: chapter7/5 140 | title: Summarization 141 | - local: chapter7/6 142 | title: Training a causal language model from scratch 143 | - local: chapter7/7 144 | title: Question answering 145 | - local: chapter7/8 146 | title: Mastering NLP 147 | - local: chapter7/9 148 | title: End-of-chapter quiz 149 | quiz: 7 150 | 151 | - title: 8. How to ask for help 152 | sections: 153 | - local: chapter8/1 154 | title: Introduction 155 | - local: chapter8/2 156 | title: What to do when you get an error 157 | - local: chapter8/3 158 | title: Asking for help on the forums 159 | - local: chapter8/4 160 | title: Debugging the training pipeline 161 | local_fw: { pt: chapter8/4, tf: chapter8/4_tf } 162 | - local: chapter8/5 163 | title: How to write a good issue 164 | - local: chapter8/6 165 | title: Part 2 completed! 
166 | - local: chapter8/7 167 | title: End-of-chapter quiz 168 | quiz: 8 169 | 170 | - title: Hugging Face Course Event 171 | sections: 172 | - local: event/1 173 | title: Part 2 Release Event 174 | -------------------------------------------------------------------------------- /chapters/en/chapter4/2.mdx: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Using pretrained models 4 | 5 | {#if fw === 'pt'} 6 | 7 | 13 | 14 | {:else} 15 | 16 | 22 | 23 | {/if} 24 | 25 | The Model Hub makes selecting the appropriate model simple, so that using it in any downstream library can be done in a few lines of code. Let's take a look at how to actually use one of these models, and how to contribute back to the community. 26 | 27 | Let's say we're looking for a French-based model that can perform mask filling. 28 | 29 |
30 | Selecting the Camembert model. 31 |
32 | 33 | We select the `camembert-base` checkpoint to try it out. The identifier `camembert-base` is all we need to start using it! As you've seen in previous chapters, we can instantiate it using the `pipeline()` function: 34 | 35 | ```py 36 | from transformers import pipeline 37 | 38 | camembert_fill_mask = pipeline("fill-mask", model="camembert-base") 39 | results = camembert_fill_mask("Le camembert est :)") 40 | ``` 41 | 42 | ```python out 43 | [ 44 | {'sequence': 'Le camembert est délicieux :)', 'score': 0.49091005325317383, 'token': 7200, 'token_str': 'délicieux'}, 45 | {'sequence': 'Le camembert est excellent :)', 'score': 0.1055697426199913, 'token': 2183, 'token_str': 'excellent'}, 46 | {'sequence': 'Le camembert est succulent :)', 'score': 0.03453313186764717, 'token': 26202, 'token_str': 'succulent'}, 47 | {'sequence': 'Le camembert est meilleur :)', 'score': 0.0330314114689827, 'token': 528, 'token_str': 'meilleur'}, 48 | {'sequence': 'Le camembert est parfait :)', 'score': 0.03007650189101696, 'token': 1654, 'token_str': 'parfait'} 49 | ] 50 | ``` 51 | 52 | As you can see, loading a model within a pipeline is extremely simple. The only thing you need to watch out for is that the chosen checkpoint is suitable for the task it's going to be used for. For example, here we are loading the `camembert-base` checkpoint in the `fill-mask` pipeline, which is completely fine. But if we were to load this checkpoint in the `text-classification` pipeline, the results would not make any sense because the head of `camembert-base` is not suitable for this task! We recommend using the task selector in the Hugging Face Hub interface in order to select the appropriate checkpoints: 53 | 54 |
55 | The task selector on the web interface. 56 |
57 | 58 | You can also instantiate the checkpoint using the model architecture directly: 59 | 60 | {#if fw === 'pt'} 61 | ```py 62 | from transformers import CamembertTokenizer, CamembertForMaskedLM 63 | 64 | tokenizer = CamembertTokenizer.from_pretrained("camembert-base") 65 | model = CamembertForMaskedLM.from_pretrained("camembert-base") 66 | ``` 67 | 68 | However, we recommend using the [`Auto*` classes](https://huggingface.co/transformers/model_doc/auto.html?highlight=auto#auto-classes) instead, as these are by design architecture-agnostic. While the previous code sample limits users to checkpoints loadable in the CamemBERT architecture, using the `Auto*` classes makes switching checkpoints simple: 69 | 70 | ```py 71 | from transformers import AutoTokenizer, AutoModelForMaskedLM 72 | 73 | tokenizer = AutoTokenizer.from_pretrained("camembert-base") 74 | model = AutoModelForMaskedLM.from_pretrained("camembert-base") 75 | ``` 76 | {:else} 77 | ```py 78 | from transformers import CamembertTokenizer, TFCamembertForMaskedLM 79 | 80 | tokenizer = CamembertTokenizer.from_pretrained("camembert-base") 81 | model = TFCamembertForMaskedLM.from_pretrained("camembert-base") 82 | ``` 83 | 84 | However, we recommend using the [`TFAuto*` classes](https://huggingface.co/transformers/model_doc/auto.html?highlight=auto#auto-classes) instead, as these are by design architecture-agnostic. While the previous code sample limits users to checkpoints loadable in the CamemBERT architecture, using the `TFAuto*` classes makes switching checkpoints simple: 85 | 86 | ```py 87 | from transformers import AutoTokenizer, TFAutoModelForMaskedLM 88 | 89 | tokenizer = AutoTokenizer.from_pretrained("camembert-base") 90 | model = TFAutoModelForMaskedLM.from_pretrained("camembert-base") 91 | ``` 92 | {/if} 93 | 94 | 95 | When using a pretrained model, make sure to check how it was trained, on which datasets, its limits, and its biases. All of this information should be indicated on its model card. 96 | 97 | -------------------------------------------------------------------------------- /chapters/en/chapter1/1.mdx: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | ## Welcome to the 🤗 Course! 4 | 5 | 6 | 7 | This course will teach you about natural language processing (NLP) using libraries from the [Hugging Face](https://huggingface.co/) ecosystem — [🤗 Transformers](https://github.com/huggingface/transformers), [🤗 Datasets](https://github.com/huggingface/datasets), [🤗 Tokenizers](https://github.com/huggingface/tokenizers), and [🤗 Accelerate](https://github.com/huggingface/accelerate) — as well as the [Hugging Face Hub](https://huggingface.co/models). It's completely free and without ads. 8 | 9 | 10 | ## What to expect? 11 | 12 | Here is a brief overview of the course: 13 | 14 |
15 | Brief overview of the chapters of the course. 16 | 17 |
18 | 19 | - Chapters 1 to 4 provide an introduction to the main concepts of the 🤗 Transformers library. By the end of this part of the course, you will be familiar with how Transformer models work and will know how to use a model from the [Hugging Face Hub](https://huggingface.co/models), fine-tune it on a dataset, and share your results on the Hub! 20 | - Chapters 5 to 8 teach the basics of 🤗 Datasets and 🤗 Tokenizers before diving into classic NLP tasks. By the end of this part, you will be able to tackle the most common NLP problems by yourself. 21 | - Chapters 9 to 12 go beyond NLP, and explore how Transformer models can be used tackle tasks in speech processing and computer vision. Along the way, you'll learn how to build and share demos of your models, and optimize them for production environments. By the end of this part, you will be ready to apply 🤗 Transformers to (almost) any machine learning problem! 22 | 23 | This course: 24 | 25 | * Requires a good knowledge of Python 26 | * Is better taken after an introductory deep learning course, such as [fast.ai's](https://www.fast.ai/) [Practical Deep Learning for Coders](https://course.fast.ai/) or one of the programs developed by [DeepLearning.AI](https://www.deeplearning.ai/) 27 | * Does not expect prior [PyTorch](https://pytorch.org/) or [TensorFlow](https://www.tensorflow.org/) knowledge, though some familiarity with either of those will help 28 | 29 | After you've completed this course, we recommend checking out DeepLearning.AI's [Natural Language Processing Specialization](https://www.coursera.org/specializations/natural-language-processing?utm_source=deeplearning-ai&utm_medium=institutions&utm_campaign=20211011-nlp-2-hugging_face-page-nlp-refresh), which covers a wide range of traditional NLP models like naive Bayes and LSTMs that are well worth knowing about! 30 | 31 | ## Who are we? 32 | 33 | About the authors: 34 | 35 | **Matthew Carrigan** is a Machine Learning Engineer at Hugging Face. He lives in Dublin, Ireland and previously worked as an ML engineer at Parse.ly and before that as a post-doctoral researcher at Trinity College Dublin. He does not believe we're going to get to AGI by scaling existing architectures, but has high hopes for robot immortality regardless. 36 | 37 | **Lysandre Debut** is a Machine Learning Engineer at Hugging Face and has been working on the 🤗 Transformers library since the very early development stages. His aim is to make NLP accessible for everyone by developing tools with a very simple API. 38 | 39 | **Sylvain Gugger** is a Research Engineer at Hugging Face and one of the core maintainers of the 🤗 Transformers library. Previously he was a Research Scientist at fast.ai, and he co-wrote _[Deep Learning for Coders with fastai and PyTorch](https://learning.oreilly.com/library/view/deep-learning-for/9781492045519/)_ with Jeremy Howard. The main focus of his research is on making deep learning more accessible, by designing and improving techniques that allow models to train fast on limited resources. 40 | 41 | **Merve Noyan** is a developer advocate at Hugging Face, working on developing tools and building content around them to democratize machine learning for everyone. 42 | 43 | **Lucile Saulnier** is a machine learning engineer at Hugging Face, developing and supporting the use of open source tools. She is also actively involved in many research projects in the field of Natural Language Processing such as collaborative training and BigScience. 
44 | 45 | **Lewis Tunstall** is a machine learning engineer at Hugging Face, focused on developing open-source tools and making them accessible to the wider community. He is also a co-author of an upcoming [O’Reilly book on Transformers](https://www.oreilly.com/library/view/natural-language-processing/9781098103231/). 46 | 47 | **Leandro von Werra** is a machine learning engineer in the open-source team at Hugging Face and also a co-author of the an upcoming [O’Reilly book on Transformers](https://www.oreilly.com/library/view/natural-language-processing/9781098103231/). He has several years of industry experience bringing NLP projects to production by working across the whole machine learning stack.. 48 | 49 | Are you ready to roll? In this chapter, you will learn: 50 | * How to use the `pipeline()` function to solve NLP tasks such as text generation and classification 51 | * About the Transformer architecture 52 | * How to distinguish between encoder, decoder, and encoder-decoder architectures and use cases 53 | -------------------------------------------------------------------------------- /chapters/en/chapter4/4.mdx: -------------------------------------------------------------------------------- 1 | # Building a model card 2 | 3 | The model card is a file which is arguably as important as the model and tokenizer files in a model repository. It is the central definition of the model, ensuring reusability by fellow community members and reproducibility of results, and providing a platform on which other members may build their artifacts. 4 | 5 | Documenting the training and evaluation process helps others understand what to expect of a model — and providing sufficient information regarding the data that was used and the preprocessing and postprocessing that were done ensures that the limitations, biases, and contexts in which the model is and is not useful can be identified and understood. 6 | 7 | Therefore, creating a model card that clearly defines your model is a very important step. Here, we provide some tips that will help you with this. Creating the model card is done through the *README.md* file you saw earlier, which is a Markdown file. 8 | 9 | The "model card" concept originates from a research direction from Google, first shared in the paper ["Model Cards for Model Reporting"](https://arxiv.org/abs/1810.03993) by Margaret Mitchell et al. A lot of information contained here is based on that paper, and we recommend you take a look at it to understand why model cards are so important in a world that values reproducibility, reusability, and fairness. 10 | 11 | The model card usually starts with a very brief, high-level overview of what the model is for, followed by additional details in the following sections: 12 | 13 | - Model description 14 | - Intended uses & limitations 15 | - How to use 16 | - Limitations and bias 17 | - Training data 18 | - Training procedure 19 | - Evaluation results 20 | 21 | Let's take a look at what each of these sections should contain. 22 | 23 | ### Model description 24 | 25 | The model description provides basic details about the model. This includes the architecture, version, if it was introduced in a paper, if an original implementation is available, the author, and general information about the model. Any copyright should be attributed here. General information about training procedures, parameters, and important disclaimers can also be mentioned in this section. 
26 | 27 | ### Intended uses & limitations 28 | 29 | Here you describe the use cases the model is intended for, including the languages, fields, and domains where it can be applied. This section of the model card can also document areas that are known to be out of scope for the model, or where it is likely to perform suboptimally. 30 | 31 | ### How to use 32 | 33 | This section should include some examples of how to use the model. This can showcase usage of the `pipeline()` function, usage of the model and tokenizer classes, and any other code you think might be helpful. 34 | 35 | ### Training data 36 | 37 | This part should indicate which dataset(s) the model was trained on. A brief description of the dataset(s) is also welcome. 38 | 39 | ### Training procedure 40 | 41 | In this section you should describe all the relevant aspects of training that are useful from a reproducibility perspective. This includes any preprocessing and postprocessing that were done on the data, as well as details such as the number of epochs the model was trained for, the batch size, the learning rate, and so on. 42 | 43 | ### Variables and metrics 44 | 45 | Here you should describe the metrics you use for evaluation, and the different factors you are measuring. Mentioning which metric(s) were used, on which dataset and which dataset split, makes it easy to compare your model's performance to that of other models. These should be informed by the previous sections, such as the intended users and use cases. 46 | 47 | ### Evaluation results 48 | 49 | Finally, provide an indication of how well the model performs on the evaluation dataset. If the model uses a decision threshold, either provide the decision threshold used in the evaluation, or provide details on evaluation at different thresholds for the intended uses. 50 | 51 | ## Example 52 | 53 | Check out the following for a few examples of well-crafted model cards: 54 | 55 | - [`bert-base-cased`](https://huggingface.co/bert-base-cased) 56 | - [`gpt2`](https://huggingface.co/gpt2) 57 | - [`distilbert`](https://huggingface.co/distilbert-base-uncased) 58 | 59 | More examples from different organizations and companies are available [here](https://github.com/huggingface/model_card/blob/master/examples.md). 60 | 61 | ## Note 62 | 63 | Model cards are not a requirement when publishing models, and you don't need to include all of the sections described above when you make one. However, explicit documentation of the model can only benefit future users, so we recommend that you fill in as many of the sections as possible to the best of your knowledge and ability. 64 | 65 | ## Model card metadata 66 | 67 | If you have done a little exploring of the Hugging Face Hub, you should have seen that some models belong to certain categories: you can filter them by tasks, languages, libraries, and more. The categories a model belongs to are identified according to the metadata you add in the model card header. 68 | 69 | For example, if you take a look at the [`camembert-base` model card](https://huggingface.co/camembert-base/blob/main/README.md), you should see the following lines in the model card header: 70 | 71 | ``` 72 | --- 73 | language: fr 74 | license: mit 75 | datasets: 76 | - oscar 77 | --- 78 | ``` 79 | 80 | This metadata is parsed by the Hugging Face Hub, which then identifies this model as being a French model, with an MIT license, trained on the Oscar dataset. 
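If you prefer to generate this header programmatically, plain Python is enough, since the model card is just a text file in your repository. The sketch below is only illustrative: the metadata values and the section text are placeholders, not taken from a real model.

```python
from pathlib import Path

# Illustrative metadata -- replace these values with the ones that describe your model
metadata = {"language": "fr", "license": "mit", "datasets": ["oscar"]}

# Build the YAML header that the Hub parses at the top of README.md
header_lines = ["---"]
for key, value in metadata.items():
    if isinstance(value, list):
        header_lines.append(f"{key}:")
        header_lines.extend(f"- {item}" for item in value)
    else:
        header_lines.append(f"{key}: {value}")
header_lines.append("---")

# Follow the header with the usual model card sections (placeholder text here)
body = "\n# My model\n\n## Model description\n\nDescribe your model here.\n"

Path("README.md").write_text("\n".join(header_lines) + "\n" + body)
```

You can then commit this *README.md* to your model repository like any other file.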
81 | 82 | The [full model card specification](https://github.com/huggingface/hub-docs/blame/main/modelcard.md) allows specifying languages, licenses, tags, datasets, metrics, as well as the evaluation results the model obtained when training. 83 | -------------------------------------------------------------------------------- /chapters/en/chapter0/1.mdx: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | Welcome to the Hugging Face course! This introduction will guide you through setting up a working environment. If you're just starting the course, we recommend you first take a look at [Chapter 1](/course/chapter1), then come back and set up your environment so you can try the code yourself. 4 | 5 | All the libraries that we'll be using in this course are available as Python packages, so here we'll show you how to set up a Python environment and install the specific libraries you'll need. 6 | 7 | We'll cover two ways of setting up your working environment, using a Colab notebook or a Python virtual environment. Feel free to choose the one that resonates with you the most. For beginners, we strongly recommend that you get started by using a Colab notebook. 8 | 9 | Note that we will not be covering the Windows system. If you're running on Windows, we recommend following along using a Colab notebook. If you're using a Linux distribution or macOS, you can use either approach described here. 10 | 11 | Most of the course relies on you having a Hugging Face account. We recommend creating one now: [create an account](https://huggingface.co/join). 12 | 13 | ## Using a Google Colab notebook 14 | 15 | Using a Colab notebook is the simplest possible setup; boot up a notebook in your browser and get straight to coding! 16 | 17 | If you're not familiar with Colab, we recommend you start by following the [introduction](https://colab.research.google.com/notebooks/intro.ipynb). Colab allows you to use some accelerating hardware, like GPUs or TPUs, and it is free for smaller workloads. 18 | 19 | Once you're comfortable moving around in Colab, create a new notebook and get started with the setup: 20 | 21 |
22 | An empty colab notebook 23 |
24 | 25 | The next step is to install the libraries that we'll be using in this course. We'll use `pip` for the installation, which is the package manager for Python. In notebooks, you can run system commands by preceding them with the `!` character, so you can install the 🤗 Transformers library as follows: 26 | 27 | ``` 28 | !pip install transformers 29 | ``` 30 | 31 | You can make sure the package was correctly installed by importing it within your Python runtime: 32 | 33 | ``` 34 | import transformers 35 | ``` 36 | 37 |
38 | A gif showing the result of the two commands above: installation and import 39 |
40 | 41 | This installs a very light version of 🤗 Transformers. In particular, no specific machine learning frameworks (like PyTorch or TensorFlow) are installed. Since we'll be using a lot of different features of the library, we recommend installing the development version, which comes with all the required dependencies for pretty much any imaginable use case: 42 | 43 | ``` 44 | !pip install transformers[sentencepiece] 45 | ``` 46 | 47 | This will take a bit of time, but then you'll be ready to go for the rest of the course! 48 | 49 | ## Using a Python virtual environment 50 | 51 | If you prefer to use a Python virtual environment, the first step is to install Python on your system. We recommend following [this guide](https://realpython.com/installing-python/) to get started. 52 | 53 | Once you have Python installed, you should be able to run Python commands in your terminal. You can start by running the following command to ensure that it is correctly installed before proceeding to the next steps: `python --version`. This should print out the Python version now available on your system. 54 | 55 | When running a Python command in your terminal, such as `python --version`, you should think of the program running your command as the "main" Python on your system. We recommend keeping this main installation free of any packages, and using it to create separate environments for each application you work on — this way, each application can have its own dependencies and packages, and you won't need to worry about potential compatibility issues with other applications. 56 | 57 | In Python this is done with [*virtual environments*](https://docs.python.org/3/tutorial/venv.html), which are self-contained directory trees that each contain a Python installation with a particular Python version alongside all the packages the application needs. Creating such a virtual environment can be done with a number of different tools, but we'll use the official Python package for that purpose, which is called [`venv`](https://docs.python.org/3/library/venv.html#module-venv). 58 | 59 | First, create the directory you'd like your application to live in — for example, you might want to make a new directory called *transformers-course* at the root of your home directory: 60 | 61 | ``` 62 | mkdir ~/transformers-course 63 | cd ~/transformers-course 64 | ``` 65 | 66 | From inside this directory, create a virtual environment using the Python `venv` module: 67 | 68 | ``` 69 | python -m venv .env 70 | ``` 71 | 72 | You should now have a directory called *.env* in your otherwise empty folder: 73 | 74 | ``` 75 | ls -a 76 | ``` 77 | 78 | ```out 79 | . .. .env 80 | ``` 81 | 82 | You can jump in and out of your virtual environment with the `activate` and `deactivate` scripts: 83 | 84 | ``` 85 | # Activate the virtual environment 86 | source .env/bin/activate 87 | 88 | # Deactivate the virtual environment 89 | source .env/bin/deactivate 90 | ``` 91 | 92 | You can make sure that the environment is activated by running the `which python` command: if it points to the virtual environment, then you have successfully activated it! 93 | 94 | ``` 95 | which python 96 | ``` 97 | 98 | ```out 99 | /home//transformers-course/.env/bin/python 100 | ``` 101 | 102 | ### Installing dependencies 103 | 104 | As in the previous section on using Google Colab instances, you'll now need to install the packages required to continue. 
Again, you can install the development version of 🤗 Transformers using the `pip` package manager: 105 | 106 | ``` 107 | pip install "transformers[sentencepiece]" 108 | ``` 109 | 110 | You're now all set up and ready to go! 111 | -------------------------------------------------------------------------------- /chapters/es/chapter0/section1.mdx: -------------------------------------------------------------------------------- 1 | Bienvenido al curso de Hugging Face. Esta introducción te guiará en la configuración de un entorno de trabajo. Si acabas de empezar el curso, te recomendamos que primero eches un vistazo al [Capítulo 1](/course/chapter1), y luego vuelvas y configures tu entorno para poder probar el código por ti mismo. 2 | 3 | Todas las bibliotecas que usaremos en este curso están disponibles como paquetes de Python, así que aquí te mostraremos cómo configurar un entorno de Python e instalar las bibliotecas específicas que necesitarás. 4 | 5 | Cubriremos dos formas de configurar tu entorno de trabajo, utilizando un cuaderno Colab o un entorno virtual Python. Siéntete libre de elegir la que más te convenga. Para los principiantes, recomendamos encarecidamente que comiencen utilizando un cuaderno Colab. 6 | 7 | Tenga en cuenta que no vamos a cubrir el sistema Windows. Si está utilizando Windows, le recomendamos que siga utilizando un cuaderno Colab. Si está utilizando una distribución de Linux o macOS, puede utilizar cualquiera de los enfoques descritos aquí. 8 | 9 | La mayor parte del curso depende de que tengas una cuenta de Hugging Face. Te recomendamos que crees una ahora: [crear una cuenta](https://huggingface.co/join). 10 | 11 | ## Uso de un cuaderno Google Colab 12 | 13 | Utilizar un cuaderno Colab es la configuración más sencilla posible; ¡arranca un cuaderno en tu navegador y ponte a codificar directamente! 14 | 15 | Si no estás familiarizado con Colab, te recomendamos que empieces siguiendo la [introducción](https://colab.research.google.com/notebooks/intro.ipynb). Colab te permite utilizar algún hardware de aceleración, como GPUs o TPUs, y es gratuito para cargas de trabajo pequeñas. 16 | 17 | Una vez que te sientas cómodo moviéndote en Colab, crea un nuevo notebook y comienza con la configuración: 18 | 19 |
20 | An empty colab notebook 21 |
22 | 23 | El siguiente paso es instalar las librerías que usaremos en este curso. Usaremos `pip` para la instalación, que es el gestor de paquetes para Python. En los cuadernos, puedes ejecutar comandos del sistema precediéndolos con el carácter `!`, así que puedes instalar la librería 🤗 Transformers de la siguiente manera: 24 | 25 | ``` 26 | !pip install transformers 27 | ``` 28 | 29 | Puede asegurarse de que el paquete se ha instalado correctamente importándolo en su tiempo de ejecución de Python: 30 | 31 | ``` 32 | import transformers 33 | ``` 34 | 35 |
36 | A gif showing the result of the two commands above: installation and import 37 |
38 | 39 | Esto instala una versión muy ligera de 🤗 Transformers. En particular, no se instalan frameworks específicos de deep learning (como PyTorch o TensorFlow). Dado que vamos a utilizar un montón de características diferentes de la biblioteca, se recomienda instalar la versión de desarrollo, que viene con todas las dependencias necesarias para casi cualquier caso de uso imaginable: 40 | 41 | ``` 42 | !pip install transformers[sentencepiece] 43 | ``` 44 | 45 | Esto te llevará un poco de tiempo, pero luego estarás listo para el resto del curso. 46 | 47 | ## Usar un entorno virtual de Python 48 | 49 | Si prefieres utilizar un entorno virtual de Python, el primer paso es instalar Python en tu sistema. Recomendamos seguir [esta guía](https://realpython.com/installing-python/) para empezar. 50 | 51 | Una vez que tengas Python instalado, deberías poder ejecutar comandos de Python en tu terminal. Puedes empezar ejecutando el siguiente comando para asegurarte de que está correctamente instalado antes de proceder a los siguientes pasos: `python --version`. Esto debería imprimir la versión de Python disponible en tu sistema. 52 | 53 | Cuando ejecutes un comando de Python en tu terminal, como `python --version`, debes pensar en el programa que ejecuta tu comando como el Python "principal" de tu sistema. Recomendamos mantener esta instalación principal libre de paquetes, y usarla para crear entornos separados para cada aplicación en la que trabajes - de esta manera, cada aplicación puede tener sus propias dependencias y paquetes, y no tendrás que preocuparte por posibles problemas de compatibilidad con otras aplicaciones. 54 | 55 | En Python esto se hace con [*entornos virtuales*](https://docs.python.org/3/tutorial/venv.html), que son árboles de directorios autocontenidos que contienen cada uno una instalación de Python con una versión particular de Python junto con todos los paquetes que la aplicación necesita. La creación de un entorno virtual de este tipo puede hacerse con varias herramientas diferentes, pero nosotros utilizaremos el paquete oficial de Python para este fin, que se llama [`venv`](https://docs.python.org/3/library/venv.html#module-venv). 56 | 57 | En primer lugar, crea el directorio en el que te gustaría que viviera tu aplicación - por ejemplo, podrías crear un nuevo directorio llamado *transformers-course* en la raíz de tu directorio personal: 58 | 59 | ``` 60 | mkdir ~/transformers-course 61 | cd ~/transformers-course 62 | ``` 63 | 64 | Desde este directorio, crea un entorno virtual utilizando el módulo `venv` de Python: 65 | 66 | ``` 67 | python -m venv .env 68 | ``` 69 | 70 | Ahora debería tener un directorio llamado *.env* en su carpeta, por lo demás vacía: 71 | 72 | ``` 73 | ls -a 74 | ``` 75 | 76 | ```out 77 | . .. .env 78 | ``` 79 | 80 | Puedes entrar y salir de tu entorno virtual con los scripts `activate` y `deactivate`: 81 | 82 | ``` 83 | # Activate the virtual environment 84 | source .env/bin/activate 85 | 86 | # Deactivate the virtual environment 87 | source .env/bin/deactivate 88 | ``` 89 | 90 | Puedes asegurarte de que el entorno está activado ejecutando el comando `which python`: si apunta al entorno virtual, entonces lo has activado con éxito. 91 | 92 | ``` 93 | which python 94 | ``` 95 | 96 | ```out 97 | /home//transformers-course/.env/bin/python 98 | ``` 99 | 100 | ### Instalación de dependencias 101 | 102 | Al igual que en la sección anterior sobre el uso de las instancias de Google Colab, ahora necesitarás instalar los paquetes necesarios para continuar. 
De nuevo, puedes instalar la versión de desarrollo de 🤗 Transformers utilizando el gestor de paquetes `pip`: 103 | 104 | ``` 105 | pip install "transformers[sentencepiece]" 106 | ``` 107 | 108 | Ya está todo preparado y listo para funcionar. 109 | -------------------------------------------------------------------------------- /chapters/en/chapter8/5.mdx: -------------------------------------------------------------------------------- 1 | # How to write a good issue 2 | 3 | 9 | 10 | When you encounter something that doesn't seem right with one of the Hugging Face libraries, you should definitely let us know so we can fix it (the same goes for any open source library, for that matter). If you are not completely certain whether the bug lies in your own code or one of our libraries, the first place to check is the [forums](https://discuss.huggingface.co/). The community will help you figure this out, and the Hugging Face team also closely watches the discussions there. 11 | 12 | 13 | 14 | When you are sure you have a bug on your hands, the first step is to build a minimal reproducible example. 15 | 16 | ## Creating a minimal reproducible example 17 | 18 | It's very important to isolate the piece of code that produces the bug, as no one in the Hugging Face team is a magician (yet), and they can't fix what they can't see. A minimal reproducible example should, as the name indicates, be reproducible. This means that it should not rely on any external files or data you may have. Try to replace the data you are using with some dummy values that look like your real ones and still produce the same error. 19 | 20 | 21 | 22 | 🚨 Many issues in the 🤗 Transformers repository are unsolved because the data used to reproduce them is not accessible. 23 | 24 | 25 | 26 | Once you have something that is self-contained, you can try to reduce it to even fewer lines of code, building what we call a _minimal reproducible example_. While this requires a bit more work on your side, you will almost be guaranteed to get help and a fix if you provide a nice, short bug reproducer. 27 | 28 | If you feel comfortable enough, go inspect the source code where your bug happens. You might find a solution to your problem (in which case you can even suggest a pull request to fix it), but more generally, this can help the maintainers better understand the source when they read your report. 29 | 30 | ## Filling out the issue template 31 | 32 | When you file your issue, you will notice there is a template to fill out. We will follow the one for [🤗 Transformers issues](https://github.com/huggingface/transformers/issues/new/choose) here, but the same kind of information will be required if you report an issue in another repository. Don't leave the template blank: taking the time to fill it in will maximize your chances of getting an answer and solving your problem. 33 | 34 | In general, when filing an issue, always stay courteous. This is an open source project, so you are using free software, and no one has any obligation to help you. You may include what you feel is justified criticism in your issue, but then the maintainers may very well take it badly and not be in a rush to help you. Make sure you read the [code of conduct](https://github.com/huggingface/transformers/blob/master/CODE_OF_CONDUCT.md) of the project. 35 | 36 | ### Including your environment information 37 | 38 | 🤗 Transformers provides a utility to get all the information we need about your environment. 
Just type the following in your terminal: 39 | 40 | ``` 41 | transformers-cli env 42 | ``` 43 | 44 | and you should get something like this: 45 | 46 | ```out 47 | Copy-and-paste the text below in your GitHub issue and FILL OUT the two last points. 48 | 49 | - `transformers` version: 4.12.0.dev0 50 | - Platform: Linux-5.10.61-1-MANJARO-x86_64-with-arch-Manjaro-Linux 51 | - Python version: 3.7.9 52 | - PyTorch version (GPU?): 1.8.1+cu111 (True) 53 | - Tensorflow version (GPU?): 2.5.0 (True) 54 | - Flax version (CPU?/GPU?/TPU?): 0.3.4 (cpu) 55 | - Jax version: 0.2.13 56 | - JaxLib version: 0.1.65 57 | - Using GPU in script?: 58 | - Using distributed or parallel set-up in script?: 59 | ``` 60 | 61 | You can also add a `!` at the beginning of the `transformers-cli env` command to execute it from a notebook cell, and then copy and paste the result at the beginning of your issue. 62 | 63 | ### Tagging people 64 | 65 | Tagging people by typing an `@` followed by their GitHub handle will send them a notification so they will see your issue and might reply quicker. Use this with moderation, because the people you tag might not appreciate being notified if it's something they have no direct link to. If you have looked at the source files related to your bug, you should tag the last person that made changes at the line you think is responsible for your problem (you can find this information by looking at said line on GitHub, selecting it, then clicking "View git blame"). 66 | 67 | Otherwise, the template offers suggestions of people to tag. In general, never tag more than three people! 68 | 69 | ### Including a reproducible example 70 | 71 | If you have managed to create a self-contained example that produces the bug, now is the time to include it! Type a line with three backticks followed by `python`, like this: 72 | 73 | ``` 74 | ```python 75 | ``` 76 | 77 | then paste in your minimal reproducible example and type a new line with three backticks. This will ensure your code is properly formatted. 78 | 79 | If you didn't manage to create a reproducible example, explain in clear steps how you got to your issue. Include a link to a Google Colab notebook where you got the error if you can. The more information you share, the better able the maintainers will be to reply to you. 80 | 81 | In all cases, you should copy and paste the whole error message you are getting. If you're working in Colab, remember that some of the frames may be automatically collapsed in the stack trace, so make sure you expand them before copying. Like with the code sample, put that error message between two lines with three backticks, so it's properly formatted. 82 | 83 | ### Describing the expected behavior 84 | 85 | Explain in a few lines what you expected to get, so that the maintainers get a full grasp of the problem. This part is generally pretty obvious, so it should fit in one sentence, but in some cases you may have a lot to say. 86 | 87 | ## And then what? 88 | 89 | Once your issue is filed, make sure to quickly check everything looks okay. You can edit the issue if you made a mistake, or even change its title if you realize the problem is different from what you initially thought. 90 | 91 | There is no point pinging people if you don't get an answer. If no one helps you in a few days, it's likely that no one could make sense of your problem. Don't hesitate to go back to the reproducible example. Can you make it shorter and more to the point? 
If you don't get an answer in a week, you can leave a message gently asking for help, especially if you've edited your issue to include more information on the problem. 92 | 93 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # The Hugging Face Course 2 | 3 | This repo contains the content that's used to create the **[Hugging Face course](https://huggingface.co/course/chapter1/1)**. The course teaches you about applying Transformers to various tasks in natural language processing and beyond. Along the way, you'll learn how to use the [Hugging Face](https://huggingface.co/) ecosystem — [🤗 Transformers](https://github.com/huggingface/transformers), [🤗 Datasets](https://github.com/huggingface/datasets), [🤗 Tokenizers](https://github.com/huggingface/tokenizers), and [🤗 Accelerate](https://github.com/huggingface/accelerate) — as well as the [Hugging Face Hub](https://huggingface.co/models). It's completely free and open-source! 4 | 5 | ## 🌎 Languages and translations 6 | 7 | | Language | Source | Authors | 8 | |:-------------------------------------------------------|:--------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 9 | | [English](https://huggingface.co/course/en/chapter1/1) | [`chapters/en`](https://github.com/huggingface/course/tree/main/chapters/en) | [@sgugger](https://github.com/sgugger), [@lewtun](https://github.com/lewtun), [@LysandreJik](https://github.com/LysandreJik), [@Rocketknight1](https://github.com/Rocketknight1), [@sashavor](https://github.com/sashavor), [@osanseviero](https://github.com/osanseviero), [@SaulLu](https://github.com/SaulLu), [@lvwerra](https://github.com/lvwerra) | 10 | 11 | ### Translating the course into your language 12 | 13 | As part of our mission to democratise machine learning, we'd love to have the course available in many more languages! Please follow the steps below if you'd like to help translate the course into your language 🙏. 14 | 15 | **🗞️ Open an issue** 16 | 17 | To get started, navigate to the [_Issues_](https://github.com/huggingface/course/issues) page of this repo and check if anyone else has opened an issue for your language. If not, open a new issue by selecting the _Translation template_ from the _New issue_ button. 18 | 19 | Once an issue is created, post a comment to indicate which chapters you'd like to work on and we'll add your name to the list. 20 | 21 | **🍴 Fork the repository** 22 | 23 | Next, you'll need to [fork this repo](https://docs.github.com/en/get-started/quickstart/fork-a-repo). You can do this by clicking on the **Fork** button on the top-right corner of this repo's page. 24 | 25 | Once you've forked the repo, you'll want to get the files on your local machine for editing. You can do that by cloning the fork with Git as follows: 26 | 27 | ```bash 28 | git clone https://github.com/YOUR-USERNAME/course 29 | ``` 30 | 31 | **📋 Copy-paste the English files with a new language code** 32 | 33 | The course files are organised under a main directory: 34 | 35 | * [`chapters`](https://github.com/huggingface/course/tree/main/chapters): all the text and code snippets associated with the course. 
36 | 37 | You'll only need to copy the files in the [`chapters/en`](https://github.com/huggingface/course/tree/main/chapters/en) directory, so first navigate to your fork of the repo and run the following: 38 | 39 | ```bash 40 | cd ~/path/to/course 41 | cp -r chapters/en/CHAPTER-NUMBER chapters/LANG-ID/CHAPTER-NUMBER 42 | ``` 43 | 44 | Here, `CHAPTER-NUMBER` refers to the chapter you'd like to work on and `LANG-ID` should be one of the ISO 639-1 or ISO 639-2 language codes -- see [here](https://www.loc.gov/standards/iso639-2/php/code_list.php) for a handy table. 45 | 46 | **✍️ Start translating** 47 | 48 | Now comes the fun part - translating the text! The first thing we recommend is translating the part of the `_toctree.yml` file that corresponds to your chapter. This file is used to render the table of contents on the website and provide the links to the Colab notebooks. The only fields you should change are the `title`, ones -- for example, here are the parts of `_toctree.yml` that we'd translate for [Chapter 0](https://huggingface.co/course/chapter0/1?fw=pt): 49 | 50 | ```yaml 51 | - title: 0. Setup # Translate this! 52 | sections: 53 | - local: chapter0/1 # Do not change this! 54 | title: Introduction # Translate this! 55 | ``` 56 | 57 | Once you have translated the `_toctree.yml` file, you can start translating the [MDX](https://mdxjs.com/) files associated with your chapter. 58 | 59 | > 🙋 If the `_toctree.yml` file doesn't yet exist for your language, you can simply create one by copy-pasting from the English version and deleting the sections that aren't related to your chapter. Just make sure it exists in the `chapters/LANG-ID/` directory! 60 | 61 | ## 📔 Jupyter notebooks 62 | 63 | The Jupyter notebooks containing all the code from the course are hosted on the [`huggingface/notebooks`](https://github.com/huggingface/notebooks) repo. If you wish to generate them locally, first install the required dependencies: 64 | 65 | ```bash 66 | python -m pip install -r requirements.txt 67 | ``` 68 | 69 | Then run the following script: 70 | 71 | ```bash 72 | python utils/generate_notebooks.py --output_dir nbs 73 | ``` 74 | 75 | This script extracts all the code snippets from the English chapters and stores them as notebooks in the `nbs` folder (which is ignored by Git by default). 76 | 77 | ## ✍️ Contributing a new chapter 78 | 79 | > Note: we are not currently accepting community contributions for new chapters. These instructions are for the Hugging Face authors. 80 | 81 | Adding a new chapter to the course is quite simple: 82 | 83 | 1. Create a new directory under `chapters/en/chapterX`, where `chapterX` is the chapter you'd like to add. 84 | 2. Add numbered MDX files `sectionX.mdx` for each section. If you need to include images, place them in the [huggingface-course/documentation-images](https://huggingface.co/datasets/huggingface-course/documentation-images) repository and use the [HTML Images Syntax](https://www.w3schools.com/html/html_images.asp) with the path `https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/{langY}/{chapterX}/{your-image.png}`. 85 | 3. Update the `_toctree.yml` file to include your chapter sections -- this information will render the table of contents on the website. If your section involves both the PyTorch and TensorFlow APIs of `transformers`, make sure you include links to both Colabs in the `colab` field. 86 | 87 | If you get stuck, check out one of the existing chapters -- this will often show you the expected syntax. 
88 | 89 | Once you are happy with the content, open a pull request and tag [@lewtun](https://github.com/lewtun) for a review. We recommend adding the first chapter draft as a single pull request -- the team will then provide feedback internally to iterate on the content 🤗! 90 | 91 | ## 🙌 Acknowledgements 92 | 93 | The structure of this repo and README are inspired by the wonderful [Advanced NLP with spaCy](https://github.com/ines/spacy-course) course. -------------------------------------------------------------------------------- /chapters/en/chapter2/6.mdx: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Putting it all together 4 | 5 | {#if fw === 'pt'} 6 | 7 | 13 | 14 | {:else} 15 | 16 | 22 | 23 | {/if} 24 | 25 | In the last few sections, we've been trying our best to do most of the work by hand. We've explored how tokenizers work and looked at tokenization, conversion to input IDs, padding, truncation, and attention masks. 26 | 27 | However, as we saw in section 2, the 🤗 Transformers API can handle all of this for us with a high-level function that we'll dive into here. When you call your `tokenizer` directly on the sentence, you get back inputs that are ready to pass through your model: 28 | 29 | ```py 30 | from transformers import AutoTokenizer 31 | 32 | checkpoint = "distilbert-base-uncased-finetuned-sst-2-english" 33 | tokenizer = AutoTokenizer.from_pretrained(checkpoint) 34 | 35 | sequence = "I've been waiting for a HuggingFace course my whole life." 36 | 37 | model_inputs = tokenizer(sequence) 38 | ``` 39 | 40 | Here, the `model_inputs` variable contains everything that's necessary for a model to operate well. For DistilBERT, that includes the input IDs as well as the attention mask. Other models that accept additional inputs will also have those output by the `tokenizer` object. 41 | 42 | As we'll see in some examples below, this method is very powerful. First, it can tokenize a single sequence: 43 | 44 | ```py 45 | sequence = "I've been waiting for a HuggingFace course my whole life." 46 | 47 | model_inputs = tokenizer(sequence) 48 | ``` 49 | 50 | It also handles multiple sequences at a time, with no change in the API: 51 | 52 | ```py 53 | sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"] 54 | 55 | model_inputs = tokenizer(sequences) 56 | ``` 57 | 58 | It can pad according to several objectives: 59 | 60 | ```py 61 | # Will pad the sequences up to the maximum sequence length 62 | model_inputs = tokenizer(sequences, padding="longest") 63 | 64 | # Will pad the sequences up to the model max length 65 | # (512 for BERT or DistilBERT) 66 | model_inputs = tokenizer(sequences, padding="max_length") 67 | 68 | # Will pad the sequences up to the specified max length 69 | model_inputs = tokenizer(sequences, padding="max_length", max_length=8) 70 | ``` 71 | 72 | It can also truncate sequences: 73 | 74 | ```py 75 | sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"] 76 | 77 | # Will truncate the sequences that are longer than the model max length 78 | # (512 for BERT or DistilBERT) 79 | model_inputs = tokenizer(sequences, truncation=True) 80 | 81 | # Will truncate the sequences that are longer than the specified max length 82 | model_inputs = tokenizer(sequences, max_length=8, truncation=True) 83 | ``` 84 | 85 | The `tokenizer` object can handle the conversion to specific framework tensors, which can then be directly sent to the model. 
For example, in the following code sample we are prompting the tokenizer to return tensors from the different frameworks — `"pt"` returns PyTorch tensors, `"tf"` returns TensorFlow tensors, and `"np"` returns NumPy arrays: 86 | 87 | ```py 88 | sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"] 89 | 90 | # Returns PyTorch tensors 91 | model_inputs = tokenizer(sequences, padding=True, return_tensors="pt") 92 | 93 | # Returns TensorFlow tensors 94 | model_inputs = tokenizer(sequences, padding=True, return_tensors="tf") 95 | 96 | # Returns NumPy arrays 97 | model_inputs = tokenizer(sequences, padding=True, return_tensors="np") 98 | ``` 99 | 100 | ## Special tokens 101 | 102 | If we take a look at the input IDs returned by the tokenizer, we will see they are a tiny bit different from what we had earlier: 103 | 104 | ```py 105 | sequence = "I've been waiting for a HuggingFace course my whole life." 106 | 107 | model_inputs = tokenizer(sequence) 108 | print(model_inputs["input_ids"]) 109 | 110 | tokens = tokenizer.tokenize(sequence) 111 | ids = tokenizer.convert_tokens_to_ids(tokens) 112 | print(ids) 113 | ``` 114 | 115 | ```python out 116 | [101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102] 117 | [1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012] 118 | ``` 119 | 120 | One token ID was added at the beginning, and one at the end. Let's decode the two sequences of IDs above to see what this is about: 121 | 122 | ```py 123 | print(tokenizer.decode(model_inputs["input_ids"])) 124 | print(tokenizer.decode(ids)) 125 | ``` 126 | 127 | ```python out 128 | "[CLS] i've been waiting for a huggingface course my whole life. [SEP]" 129 | "i've been waiting for a huggingface course my whole life." 130 | ``` 131 | 132 | The tokenizer added the special word `[CLS]` at the beginning and the special word `[SEP]` at the end. This is because the model was pretrained with those, so to get the same results for inference we need to add them as well. Note that some models don't add special words, or add different ones; models may also add these special words only at the beginning, or only at the end. In any case, the tokenizer knows which ones are expected and will deal with this for you. 
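If you're ever unsure which special tokens a given checkpoint expects, you can ask the tokenizer directly. Here is a small sketch reusing the same checkpoint as above; the exact tokens printed will vary from one model to another:

```python
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Shows the special tokens this checkpoint uses, e.g. [CLS], [SEP], [PAD], ...
print(tokenizer.special_tokens_map)

# You can also opt out of adding them, which reproduces the "raw" IDs we computed earlier
sequence = "I've been waiting for a HuggingFace course my whole life."
print(tokenizer(sequence, add_special_tokens=False)["input_ids"])
```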
133 | 134 | ## Wrapping up: From tokenizer to model 135 | 136 | Now that we've seen all the individual steps the `tokenizer` object uses when applied on texts, let's see one final time how it can handle multiple sequences (padding!), very long sequences (truncation!), and multiple types of tensors with its main API: 137 | 138 | {#if fw === 'pt'} 139 | ```py 140 | import torch 141 | from transformers import AutoTokenizer, AutoModelForSequenceClassification 142 | 143 | checkpoint = "distilbert-base-uncased-finetuned-sst-2-english" 144 | tokenizer = AutoTokenizer.from_pretrained(checkpoint) 145 | model = AutoModelForSequenceClassification.from_pretrained(checkpoint) 146 | sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"] 147 | 148 | tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt") 149 | output = model(**tokens) 150 | ``` 151 | {:else} 152 | ```py 153 | import tensorflow as tf 154 | from transformers import AutoTokenizer, TFAutoModelForSequenceClassification 155 | 156 | checkpoint = "distilbert-base-uncased-finetuned-sst-2-english" 157 | tokenizer = AutoTokenizer.from_pretrained(checkpoint) 158 | model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint) 159 | sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"] 160 | 161 | tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="tf") 162 | output = model(**tokens) 163 | ``` 164 | {/if} 165 | -------------------------------------------------------------------------------- /chapters/en/chapter4/6.mdx: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # End-of-chapter quiz 6 | 7 | Let's test what you learned in this chapter! 8 | 9 | ### 1. What are models on the Hub limited to? 10 | 11 | 32 | 33 | ### 2. How can you manage models on the Hub? 34 | 35 | git-lfs for large files.", 48 | correct: true 49 | } 50 | ]} 51 | /> 52 | 53 | ### 3. What can you do using the Hugging Face Hub web interface? 54 | 55 | 83 | 84 | ### 4. What is a model card? 85 | 86 | 103 | 104 | ### 5. Which of these objects of the 🤗 Transformers library can be directly shared on the Hub with `push_to_hub()`? 105 | 106 | {#if fw === 'pt'} 107 | push_to_hub method, and using it will push all the tokenizer files (vocabulary, architecture of the tokenizer, etc.) to a given repo. That's not the only right answer, though!", 112 | correct: true 113 | }, 114 | { 115 | text: "A model configuration", 116 | explain: "Right! All model configurations have the push_to_hub method, and using it will push them to a given repo. What else can you share?", 117 | correct: true 118 | }, 119 | { 120 | text: "A model", 121 | explain: "Correct! All models have the push_to_hub method, and using it will push them and their configuration files to a given repo. That's not all you can share, though.", 122 | correct: true 123 | }, 124 | { 125 | text: "A Trainer", 126 | explain: "That's right — the Trainer also implements the push_to_hub method, and using it will upload the model, its configuration, the tokenizer, and a model card draft to a given repo. Try another answer!", 127 | correct: true 128 | } 129 | ]} 130 | /> 131 | {:else} 132 | push_to_hub method, and using it will push all the tokenizer files (vocabulary, architecture of the tokenizer, etc.) to a given repo. That's not the only right answer, though!", 137 | correct: true 138 | }, 139 | { 140 | text: "A model configuration", 141 | explain: "Right! 
All model configurations have the push_to_hub method, and using it will push them to a given repo. What else can you share?", 142 | correct: true 143 | }, 144 | { 145 | text: "A model", 146 | explain: "Correct! All models have the push_to_hub method, and using it will push them and their configuration files to a given repo. That's not all you can share, though.", 147 | correct: true 148 | }, 149 | { 150 | text: "All of the above with a dedicated callback", 151 | explain: "That's right — the PushToHubCallback will regularly send all of those objects to a repo during training.", 152 | correct: true 153 | } 154 | ]} 155 | /> 156 | {/if} 157 | 158 | ### 6. What is the first step when using the `push_to_hub()` method or the CLI tools? 159 | 160 | 178 | 179 | ### 7. You're using a model and a tokenizer — how can you upload them to the Hub? 180 | 181 | huggingface_hub utility.", 190 | explain: "Models and tokenizers already benefit from huggingface_hub utilities: no need for additional wrapping!" 191 | }, 192 | { 193 | text: "By saving them to disk and calling transformers-cli upload-model", 194 | explain: "The command upload-model does not exist." 195 | } 196 | ]} 197 | /> 198 | 199 | ### 8. Which git operations can you do with the `Repository` class? 200 | 201 | git_commit() method is there for that.", 206 | correct: true 207 | }, 208 | { 209 | text: "A pull", 210 | explain: "That is the purpose of the git_pull() method.", 211 | correct: true 212 | }, 213 | { 214 | text: "A push", 215 | explain: "The method git_push() does this.", 216 | correct: true 217 | }, 218 | { 219 | text: "A merge", 220 | explain: "No, that operation will never be possible with this API." 221 | } 222 | ]} 223 | /> 224 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/translations.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Translation template 3 | about: 🤝 Translating the course to another language 4 | title: '' 5 | labels: translation 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | 12 | Hi there 👋 13 | 14 | Let's translate the course to `YOUR-LANG` so that the whole community can benefit from this resource 🌎! 15 | 16 | Below are the chapters and files that need translating - let us know here if you'd like to translate any and we'll add your name to the list. Once you're finished, open a pull request and tag this issue by including `#issue-number` in the description, where `issue-number` is the number of this issue. 17 | 18 | > 🙋 If you'd like others to help you with the translation, you can also post in our [forums](https://discuss.huggingface.co/c/course/20) or tag [@_lewtun](https://twitter.com/_lewtun) on Twitter to gain some visibility. 
19 | 20 | ## Chapters 21 | 22 | **0 - Setup** 23 | - [ ] [`1.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter0/1.mdx) 24 | 25 | **1 - Transformer models** 26 | - [ ] [`1.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter1/1.mdx) 27 | - [ ] [`2.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter1/2.mdx) 28 | - [ ] [`3.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter1/3.mdx) 29 | - [ ] [`4.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter1/4.mdx) 30 | - [ ] [`5.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter1/5.mdx) 31 | - [ ] [`6.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter1/6.mdx) 32 | - [ ] [`7.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter1/7.mdx) 33 | - [ ] [`8.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter1/8.mdx) 34 | - [ ] [`9.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter1/9.mdx) 35 | - [ ] [`10.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter1/10.mdx) 36 | 37 | **2 - Using 🤗 Transformers** 38 | - [ ] [`1.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter2/1.mdx) 39 | - [ ] [`2.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter2/2.mdx) 40 | - [ ] [`3.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter2/3.mdx) 41 | - [ ] [`4.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter2/4.mdx) 42 | - [ ] [`5.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter2/5.mdx) 43 | - [ ] [`6.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter2/6.mdx) 44 | - [ ] [`7.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter2/7.mdx) 45 | - [ ] [`8.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter2/8.mdx) 46 | 47 | **3 - Fine-tuning a pretrained model** 48 | - [ ] [`1.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter3/1.mdx) 49 | - [ ] [`2.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter3/2.mdx) 50 | - [ ] [`3.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter3/3.mdx) 51 | - [ ] [`3_tf.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter3/3_tf.mdx) 52 | - [ ] [`4.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter3/4.mdx) 53 | - [ ] [`5.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter3/5.mdx) 54 | - [ ] [`6.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter3/6.mdx) 55 | 56 | **4 - Sharing models and tokenizers** 57 | - [ ] [`1.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter4/1.mdx) 58 | - [ ] [`2.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter4/2.mdx) 59 | - [ ] [`3.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter4/3.mdx) 60 | - [ ] [`4.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter4/4.mdx) 61 | - [ ] [`5.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter4/5.mdx) 62 | - [ ] [`6.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter4/6.mdx) 63 | 64 | **5 - The 🤗 Datasets library** 65 | - [ ] [`1.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter5/1.mdx) 66 | - [ ] 
[`2.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter5/2.mdx) 67 | - [ ] [`3.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter5/3.mdx) 68 | - [ ] [`4.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter5/4.mdx) 69 | - [ ] [`5.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter5/5.mdx) 70 | - [ ] [`6.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter5/6.mdx) 71 | - [ ] [`7.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter5/7.mdx) 72 | - [ ] [`8.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter5/8.mdx) 73 | 74 | **6 - The 🤗 Tokenizers library** 75 | - [ ] [`1.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter6/1.mdx) 76 | - [ ] [`2.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter6/2.mdx) 77 | - [ ] [`3.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter6/3.mdx) 78 | - [ ] [`3b.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter6/3b.mdx) 79 | - [ ] [`4.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter6/4.mdx) 80 | - [ ] [`5.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter6/5.mdx) 81 | - [ ] [`6.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter6/6.mdx) 82 | - [ ] [`7.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter6/7.mdx) 83 | - [ ] [`8.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter6/8.mdx) 84 | - [ ] [`9.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter6/9.mdx) 85 | - [ ] [`10.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter6/10.mdx) 86 | 87 | **7 - Main NLP tasks** 88 | - [ ] [`1.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter7/1.mdx) 89 | - [ ] [`2.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter7/2.mdx) 90 | - [ ] [`3.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter7/3.mdx) 91 | - [ ] [`4.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter7/4.mdx) 92 | - [ ] [`5.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter7/5.mdx) 93 | - [ ] [`6.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter7/6.mdx) 94 | - [ ] [`7.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter7/7.mdx) 95 | - [ ] [`8.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter7/8.mdx) 96 | - [ ] [`9.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter7/9.mdx) 97 | 98 | **8 - How to ask for help** 99 | - [ ] [`1.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter8/1.mdx) 100 | - [ ] [`2.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter8/2.mdx) 101 | - [ ] [`3.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter8/3.mdx) 102 | - [ ] [`4.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter8/4.mdx) 103 | - [ ] [`4_tf.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter8/4_tf.mdx) 104 | - [ ] [`5.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter8/5.mdx) 105 | - [ ] [`6.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter8/6.mdx) 106 | - [ ] [`7.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/chapter8/7.mdx) 107 | 108 | **Events** 109 | - [ ] 
[`1.mdx`](https://github.com/huggingface/course/blob/main/chapters/en/event/1.mdx) -------------------------------------------------------------------------------- /chapters/en/chapter6/4.mdx: -------------------------------------------------------------------------------- 1 | # Normalization and pre-tokenization 2 | 3 | 9 | 10 | Before we dive more deeply into the three most common subword tokenization algorithms used with Transformer models (Byte-Pair Encoding [BPE], WordPiece, and Unigram), we'll first take a look at the preprocessing that each tokenizer applies to text. Here's a high-level overview of the steps in the tokenization pipeline: 11 | 12 |
13 | The tokenization pipeline. 14 | 15 |
16 | 17 | Before splitting a text into subtokens (according to its model), the tokenizer performs two steps: _normalization_ and _pre-tokenization_. 18 | 19 | ## Normalization 20 | 21 | 22 | 23 | The normalization step involves some general cleanup, such as removing needless whitespace, lowercasing, and/or removing accents. If you're familiar with [Unicode normalization](http://www.unicode.org/reports/tr15/) (such as NFC or NFKC), this is also something the tokenizer may apply. 24 | 25 | The 🤗 Transformers `tokenizer` has an attribute called `backend_tokenizer` that provides access to the underlying tokenizer from the 🤗 Tokenizers library: 26 | 27 | ```py 28 | from transformers import AutoTokenizer 29 | 30 | tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") 31 | print(type(tokenizer.backend_tokenizer)) 32 | ``` 33 | 34 | ```python out 35 | 36 | ``` 37 | 38 | The `normalizer` attribute of the `tokenizer` object has a `normalize_str()` method that we can use to see how the normalization is performed: 39 | 40 | ```py 41 | print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?")) 42 | ``` 43 | 44 | ```python out 45 | 'hello how are u?' 46 | ``` 47 | 48 | In this example, since we picked the `bert-base-uncased` checkpoint, the normalization applied lowercasing and removed the accents. 49 | 50 | 51 | 52 | ✏️ **Try it out!** Load a tokenizer from the `bert-base-cased` checkpoint and pass the same example to it. What are the main differences you can see between the cased and uncased versions of the tokenizer? 53 | 54 | 55 | 56 | ## Pre-tokenization 57 | 58 | 59 | 60 | As we will see in the next sections, a tokenizer cannot be trained on raw text alone. Instead, we first need to split the texts into small entities, like words. That's where the pre-tokenization step comes in. As we saw in [Chapter 2](/course/chapter2), a word-based tokenizer can simply split a raw text into words on whitespace and punctuation. Those words will be the boundaries of the subtokens the tokenizer can learn during its training. 61 | 62 | To see how a fast tokenizer performs pre-tokenization, we can use the `pre_tokenize_str()` method of the `pre_tokenizer` attribute of the `tokenizer` object: 63 | 64 | ```py 65 | tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are you?") 66 | ``` 67 | 68 | ```python out 69 | [('Hello', (0, 5)), (',', (5, 6)), ('how', (7, 10)), ('are', (11, 14)), ('you', (16, 19)), ('?', (19, 20))] 70 | ``` 71 | 72 | Notice how the tokenizer is already keeping track of the offsets, which is how it can give us the offset mapping we used in the previous section. Here the tokenizer ignores the two spaces and replaces them with just one, but the offset jumps between `are` and `you` to account for that. 73 | 74 | Since we're using a BERT tokenizer, the pre-tokenization involves splitting on whitespace and punctuation. Other tokenizers can have different rules for this step. 
For example, if we use the GPT-2 tokenizer: 75 | 76 | ```py 77 | tokenizer = AutoTokenizer.from_pretrained("gpt2") 78 | tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are you?") 79 | ``` 80 | 81 | it will split on whitespace and punctuation as well, but it will keep the spaces and replace them with a `Ġ` symbol, enabling it to recover the original spaces if we decode the tokens: 82 | 83 | ```python out 84 | [('Hello', (0, 5)), (',', (5, 6)), ('Ġhow', (6, 10)), ('Ġare', (10, 14)), ('Ġ', (14, 15)), ('Ġyou', (15, 19)), 85 | ('?', (19, 20))] 86 | ``` 87 | 88 | Also note that unlike the BERT tokenizer, this tokenizer does not ignore the double space. 89 | 90 | For a last example, let's have a look at the T5 tokenizer, which is based on the SentencePiece algorithm: 91 | 92 | ```py 93 | tokenizer = AutoTokenizer.from_pretrained("t5-small") 94 | tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are you?") 95 | ``` 96 | 97 | ```python out 98 | [('▁Hello,', (0, 6)), ('▁how', (7, 10)), ('▁are', (11, 14)), ('▁you?', (16, 20))] 99 | ``` 100 | 101 | Like the GPT-2 tokenizer, this one keeps spaces and replaces them with a specific token (`▁`), but the T5 tokenizer only splits on whitespace, not punctuation. Also note that it added a space by default at the beginning of the sentence (before `Hello`) and ignored the double space between `are` and `you`. 102 | 103 | Now that we've seen a little of how some different tokenizers process text, we can start to explore the underlying algorithms themselves. We'll begin with a quick look at the widely applicable SentencePiece; then, over the next three sections, we'll examine how the three main algorithms used for subword tokenization work. 104 | 105 | ## SentencePiece 106 | 107 | [SentencePiece](https://github.com/google/sentencepiece) is a tokenization algorithm for the preprocessing of text that you can use with any of the models we will see in the next three sections. It considers the text as a sequence of Unicode characters, and replaces spaces with a special character, `▁`. Used in conjunction with the Unigram algorithm (see [section 7](/course/chapter7/7)), it doesn't even require a pre-tokenization step, which is very useful for languages where the space character is not used (like Chinese or Japanese). 108 | 109 | The other main feature of SentencePiece is *reversible tokenization*: since there is no special treatment of spaces, decoding the tokens is done simply by concatenating them and replacing the `▁`s with spaces -- this results in the normalized text. As we saw earlier, the BERT tokenizer removes repeating spaces, so its tokenization is not reversible. 110 | 111 | ## Algorithm overview 112 | 113 | In the following sections, we'll dive into the three main subword tokenization algorithms: BPE (used by GPT-2 and others), WordPiece (used for example by BERT), and Unigram (used by T5 and others). Before we get started, here's a quick overview of how they each work. Don't hesitate to come back to this table after reading each of the next sections if it doesn't make sense to you yet. 
114 | 115 | 116 | Model | BPE | WordPiece | Unigram 117 | :----:|:---:|:---------:|:------: 118 | Training | Starts from a small vocabulary and learns rules to merge tokens | Starts from a small vocabulary and learns rules to merge tokens | Starts from a large vocabulary and learns rules to remove tokens 119 | Training step | Merges the tokens corresponding to the most common pair | Merges the tokens corresponding to the pair with the best score based on the frequency of the pair, privileging pairs where each individual token is less frequent | Removes all the tokens in the vocabulary that will minimize the loss computed on the whole corpus 120 | Learns | Merge rules and a vocabulary | Just a vocabulary | A vocabulary with a score for each token 121 | Encoding | Splits a word into characters and applies the merges learned during training | Finds the longest subword starting from the beginning that is in the vocabulary, then does the same for the rest of the word | Finds the most likely split into tokens, using the scores learned during training 122 | 123 | Now let's dive into BPE! -------------------------------------------------------------------------------- /chapters/en/chapter5/2.mdx: -------------------------------------------------------------------------------- 1 | # What if my dataset isn't on the Hub? 2 | 3 | 9 | 10 | You know how to use the [Hugging Face Hub](https://huggingface.co/datasets) to download datasets, but you'll often find yourself working with data that is stored either on your laptop or on a remote server. In this section we'll show you how 🤗 Datasets can be used to load datasets that aren't available on the Hugging Face Hub. 11 | 12 | 13 | 14 | ## Working with local and remote datasets 15 | 16 | 🤗 Datasets provides loading scripts to handle the loading of local and remote datasets. It supports several common data formats, such as: 17 | 18 | | Data format | Loading script | Example | 19 | | :----------------: | :------------: | :-----------------------------------------------------: | 20 | | CSV & TSV | `csv` | `load_dataset("csv", data_files="my_file.csv")` | 21 | | Text files | `text` | `load_dataset("text", data_files="my_file.txt")` | 22 | | JSON & JSON Lines | `json` | `load_dataset("json", data_files="my_file.jsonl")` | 23 | | Pickled DataFrames | `pandas` | `load_dataset("pandas", data_files="my_dataframe.pkl")` | 24 | 25 | As shown in the table, for each data format we just need to specify the type of loading script in the `load_dataset()` function, along with a `data_files` argument that specifies the path to one or more files. Let's start by loading a dataset from local files; later we'll see how to do the same with remote files. 26 | 27 | ## Loading a local dataset 28 | 29 | For this example we'll use the [SQuAD-it dataset](https://github.com/crux82/squad-it/), which is a large-scale dataset for question answering in Italian. 
30 | 31 | The training and test splits are hosted on GitHub, so we can download them with a simple `wget` command: 32 | 33 | ```python 34 | !wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz 35 | !wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz 36 | ``` 37 | 38 | This will download two compressed files called *SQuAD_it-train.json.gz* and *SQuAD_it-test.json.gz*, which we can decompress with the Linux `gzip` command: 39 | 40 | ```python 41 | !gzip -dkv SQuAD_it-*.json.gz 42 | ``` 43 | 44 | ```bash 45 | SQuAD_it-test.json.gz: 87.4% -- replaced with SQuAD_it-test.json 46 | SQuAD_it-train.json.gz: 82.2% -- replaced with SQuAD_it-train.json 47 | ``` 48 | 49 | We can see that the compressed files have been replaced with _SQuAD_it-train.json_ and _SQuAD_it-test.json_, and that the data is stored in the JSON format. 50 | 51 | 52 | 53 | ✎ If you're wondering why there's a `!` character in the above shell commands, that's because we're running them within a Jupyter notebook. Simply remove the prefix if you want to download and unzip the dataset within a terminal. 54 | 55 | 56 | 57 | To load a JSON file with the `load_dataset()` function, we just need to know if we're dealing with ordinary JSON (similar to a nested dictionary) or JSON Lines (line-separated JSON). Like many question answering datasets, SQuAD-it uses the nested format, with all the text stored in a `data` field. This means we can load the dataset by specifying the `field` argument as follows: 58 | 59 | ```py 60 | from datasets import load_dataset 61 | 62 | squad_it_dataset = load_dataset("json", data_files="SQuAD_it-train.json", field="data") 63 | ``` 64 | 65 | By default, loading local files creates a `DatasetDict` object with a `train` split. We can see this by inspecting the `squad_it_dataset` object: 66 | 67 | ```py 68 | squad_it_dataset 69 | ``` 70 | 71 | ```python out 72 | DatasetDict({ 73 | train: Dataset({ 74 | features: ['title', 'paragraphs'], 75 | num_rows: 442 76 | }) 77 | }) 78 | ``` 79 | 80 | This shows us the number of rows and the column names associated with the training set. We can view one of the examples by indexing into the `train` split as follows: 81 | 82 | ```py 83 | squad_it_dataset["train"][0] 84 | ``` 85 | 86 | ```python out 87 | { 88 | "title": "Terremoto del Sichuan del 2008", 89 | "paragraphs": [ 90 | { 91 | "context": "Il terremoto del Sichuan del 2008 o il terremoto...", 92 | "qas": [ 93 | { 94 | "answers": [{"answer_start": 29, "text": "2008"}], 95 | "id": "56cdca7862d2951400fa6826", 96 | "question": "In quale anno si è verificato il terremoto nel Sichuan?", 97 | }, 98 | ... 99 | ], 100 | }, 101 | ... 102 | ], 103 | } 104 | ``` 105 | 106 | Great, we've loaded our first local dataset! But while this worked for the training set, what we really want is to include both the `train` and `test` splits in a single `DatasetDict` object so we can apply `Dataset.map()` functions across both splits at once.
To do this, we can provide a dictionary to the `data_files` argument that maps each split name to a file associated with that split: 107 | 108 | ```py 109 | data_files = {"train": "SQuAD_it-train.json", "test": "SQuAD_it-test.json"} 110 | squad_it_dataset = load_dataset("json", data_files=data_files, field="data") 111 | squad_it_dataset 112 | ``` 113 | 114 | ```python out 115 | DatasetDict({ 116 | train: Dataset({ 117 | features: ['title', 'paragraphs'], 118 | num_rows: 442 119 | }) 120 | test: Dataset({ 121 | features: ['title', 'paragraphs'], 122 | num_rows: 48 123 | }) 124 | }) 125 | ``` 126 | 127 | This is exactly what we wanted. Now, we can apply various preprocessing techniques to clean up the data, tokenize the text, and so on. 128 | 129 | 130 | 131 | The `data_files` argument of the `load_dataset()` function is quite flexible and can be either a single file path, a list of file paths, or a dictionary that maps split names to file paths. You can also glob files that match a specified pattern according to the rules used by the Unix shell (e.g., you can glob all the JSON files in a directory as a single split by setting `data_files="*.json"`). See the 🤗 Datasets [documentation](https://huggingface.co/docs/datasets/loading.html#local-and-remote-files) for more details. 132 | 133 | 134 | 135 | The loading scripts in 🤗 Datasets actually support automatic decompression of the input files, so we could have skipped the use of `gzip` by pointing the `data_files` argument directly to the compressed files: 136 | 137 | ```py 138 | data_files = {"train": "SQuAD_it-train.json.gz", "test": "SQuAD_it-test.json.gz"} 139 | squad_it_dataset = load_dataset("json", data_files=data_files, field="data") 140 | ``` 141 | 142 | This can be useful if you don't want to manually decompress many GZIP files. The automatic decompression also applies to other common formats like ZIP and TAR, so you just need to point `data_files` to the compressed files and you're good to go! 143 | 144 | Now that you know how to load local files on your laptop or desktop, let's take a look at loading remote files. 145 | 146 | ## Loading a remote dataset 147 | 148 | If you're working as a data scientist or coder in a company, there's a good chance the datasets you want to analyze are stored on some remote server. Fortunately, loading remote files is just as simple as loading local ones! Instead of providing a path to local files, we point the `data_files` argument of `load_dataset()` to one or more URLs where the remote files are stored. For example, for the SQuAD-it dataset hosted on GitHub, we can just point `data_files` to the _SQuAD_it-*.json.gz_ URLs as follows: 149 | 150 | ```py 151 | url = "https://github.com/crux82/squad-it/raw/master/" 152 | data_files = { 153 | "train": url + "SQuAD_it-train.json.gz", 154 | "test": url + "SQuAD_it-test.json.gz", 155 | } 156 | squad_it_dataset = load_dataset("json", data_files=data_files, field="data") 157 | ``` 158 | 159 | This returns the same `DatasetDict` object obtained above, but saves us the step of manually downloading and decompressing the _SQuAD_it-*.json.gz_ files. This wraps up our foray into the various ways to load datasets that aren't hosted on the Hugging Face Hub. Now that we've got a dataset to play with, let's get our hands dirty with various data-wrangling techniques!
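Before moving on, here is a minimal sketch of the other forms the `data_files` argument can take, as mentioned in the tip earlier in this section. The file names here are hypothetical and only illustrate the call patterns:

```py
from datasets import load_dataset

# A list of files can be mapped to a single split (hypothetical file names)
data_files = {"train": ["SQuAD_it-part1.json", "SQuAD_it-part2.json"]}
dataset = load_dataset("json", data_files=data_files, field="data")

# A Unix-style glob pattern gathers every matching file into one split
dataset = load_dataset("json", data_files="*.json", field="data")
```

The same patterns apply to the other loading scripts (`csv`, `text`, and `pandas`), so you can reuse them in the exercise below.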
160 | 161 | 162 | 163 | ✏️ **Try it out!** Pick another dataset hosted on GitHub or the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php) and try loading it both locally and remotely using the techniques introduced above. For bonus points, try loading a dataset that’s stored in a CSV or text format (see the [documentation](https://huggingface.co/docs/datasets/loading.html#local-and-remote-files) for more information on these formats). 164 | 165 | 166 | 167 | 168 | -------------------------------------------------------------------------------- /chapters/en/chapter8/7.mdx: -------------------------------------------------------------------------------- 1 | 2 | 3 | # End-of-chapter quiz 4 | 5 | Let's test what you learned in this chapter! 6 | 7 | ### 1. In which order should you read a Python traceback? 8 | 9 | 22 | 23 | ### 2. What is a minimal reproducible example? 24 | 25 | 46 | 47 | ### 3. Suppose you try to run the following code, which throws an error: 48 | 49 | ```py 50 | from transformers import GPT3ForSequenceClassification 51 | 52 | # ImportError: cannot import name 'GPT3ForSequenceClassification' from 'transformers' (/Users/lewtun/miniconda3/envs/huggingface/lib/python3.8/site-packages/transformers/__init__.py) 53 | # --------------------------------------------------------------------------- 54 | # ImportError Traceback (most recent call last) 55 | # /var/folders/28/k4cy5q7s2hs92xq7_h89_vgm0000gn/T/ipykernel_30848/333858878.py in 56 | # ----> 1 from transformers import GPT3ForSequenceClassification 57 | 58 | # ImportError: cannot import name 'GPT3ForSequenceClassification' from 'transformers' (/Users/lewtun/miniconda3/envs/huggingface/lib/python3.8/site-packages/transformers/__init__.py) 59 | ``` 60 | 61 | Which of the following might be a good choice for the title of a forum topic to ask for help? 62 | 63 | ImportError: cannot import name 'GPT3ForSequenceClassification' from 'transformers' (/Users/lewtun/miniconda3/envs/huggingface/lib/python3.8/site-packages/transformers/__init__.py)", 67 | explain: "Including the last line of the traceback can be descriptive, but this is better reserved for the main body of the topic. Try again!" 68 | }, 69 | { 70 | text: "Problem with from transformers import GPT3ForSequenceClassification", 71 | explain: "Try again -- although this provides useful information, it's probably best reserved for the main body of the text.", 72 | }, 73 | { 74 | text: "Why can't I import GPT3ForSequenceClassification?", 75 | explain: "Good choice! This title is concise and gives the reader a clue about what might be wrong (i.e., that GPT-3 is not supported in 🤗 Transformers).", 76 | correct: true 77 | }, 78 | { 79 | text: "Is GPT-3 supported in 🤗 Transformers?", 80 | explain: "Good one! Using questions as topic titles is a great way to communicate the problem to the community.", 81 | correct: true 82 | } 83 | ]} 84 | /> 85 | 86 | ### 4. Suppose you've tried to run `trainer.train()` and are faced with a cryptic error that doesn't tell you exactly where the error is coming from. Which of the following is the first place you should look for errors in your training pipeline? 87 | 88 | 109 | 110 | ### 5. What is the best way to debug a CUDA error? 111 | 112 | 137 | 138 | ### 6. What is the best way to get an issue on GitHub fixed? 139 | 140 | 158 | 159 | ### 7. Why is overfitting to one batch usually a good debugging technique? 160 | 161 | 178 | 179 | ### 8. 
Why is it a good idea to include details on your compute environment with `transformers-cli env` when creating a new issue in the 🤗 Transformers repo? 180 | 181 | -------------------------------------------------------------------------------- /chapters/en/chapter1/10.mdx: -------------------------------------------------------------------------------- 1 | 2 | 3 | # End-of-chapter quiz 4 | 5 | This chapter covered a lot of ground! Don't worry if you didn't grasp all the details; the next chapters will help you understand how things work under the hood. 6 | 7 | First, though, let's test what you learned in this chapter! 8 | 9 | 10 | ### 1. Explore the Hub and look for the `roberta-large-mnli` checkpoint. What task does it perform? 11 | 12 | 13 | roberta-large-mnli page." 18 | }, 19 | { 20 | text: "Text classification", 21 | explain: "More precisely, it classifies if two sentences are logically linked across three labels (contradiction, neutral, entailment) — a task also called natural language inference.", 22 | correct: true 23 | }, 24 | { 25 | text: "Text generation", 26 | explain: "Look again on the roberta-large-mnli page." 27 | } 28 | ]} 29 | /> 30 | 31 | ### 2. What will the following code return? 32 | 33 | ```py 34 | from transformers import pipeline 35 | 36 | ner = pipeline("ner", grouped_entities=True) 37 | ner("My name is Sylvain and I work at Hugging Face in Brooklyn.") 38 | ``` 39 | 40 | sentiment-analysis pipeline." 45 | }, 46 | { 47 | text: "It will return a generated text completing this sentence.", 48 | explain: "This is incorrect — it would be a text-generation pipeline.", 49 | }, 50 | { 51 | text: "It will return the words representing persons, organizations or locations.", 52 | explain: "Furthermore, with grouped_entities=True, it will group together the words belonging to the same entity, like \"Hugging Face\".", 53 | correct: true 54 | } 55 | ]} 56 | /> 57 | 58 | ### 3. What should replace ... in this code sample? 59 | 60 | ```py 61 | from transformers import pipeline 62 | 63 | filler = pipeline("fill-mask", model="bert-base-cased") 64 | result = filler("...") 65 | ``` 66 | 67 | has been waiting for you.", 71 | explain: "This is incorrect. Check out the bert-base-cased model card and try to spot your mistake." 72 | }, 73 | { 74 | text: "This [MASK] has been waiting for you.", 75 | explain: "Correct! This model's mask token is [MASK].", 76 | correct: true 77 | }, 78 | { 79 | text: "This man has been waiting for you.", 80 | explain: "This is incorrect. This pipeline fills in masked words, so it needs a mask token somewhere." 81 | } 82 | ]} 83 | /> 84 | 85 | ### 4. Why will this code fail? 86 | 87 | ```py 88 | from transformers import pipeline 89 | 90 | classifier = pipeline("zero-shot-classification") 91 | result = classifier("This is a course about the Transformers library") 92 | ``` 93 | 94 | candidate_labels=[...].", 99 | correct: true 100 | }, 101 | { 102 | text: "This pipeline requires several sentences, not just one.", 103 | explain: "This is incorrect, though when properly used, this pipeline can take a list of sentences to process (like all other pipelines)." 104 | }, 105 | { 106 | text: "The 🤗 Transformers library is broken, as usual.", 107 | explain: "We won't dignify this answer with a comment!" 108 | }, 109 | { 110 | text: "This pipeline requires longer inputs; this one is too short.", 111 | explain: "This is incorrect. Note that a very long text will be truncated when processed by this pipeline." 112 | } 113 | ]} 114 | /> 115 | 116 | ### 5. 
What does "transfer learning" mean? 117 | 118 | 135 | 136 | ### 6. True or false? A language model usually does not need labels for its pretraining. 137 | 138 | 139 | self-supervised, which means the labels are created automatically from the inputs (like predicting the next word or filling in some masked words).", 144 | correct: true 145 | }, 146 | { 147 | text: "False", 148 | explain: "This is not the correct answer." 149 | } 150 | ]} 151 | /> 152 | 153 | ### 7. Select the sentence that best describes the terms "model," "architecture," and "weights." 154 | 155 | 172 | 173 | 174 | ### 8. Which of these types of models would you use for completing prompts with generated text? 175 | 176 | 193 | 194 | ### 9. Which of those types of models would you use for summarizing texts? 195 | 196 | 213 | 214 | ### 10. Which of these types of models would you use for classifying text inputs according to certain labels? 215 | 216 | 233 | 234 | ### 11. What possible source can the bias observed in a model have? 235 | 236 | 255 | -------------------------------------------------------------------------------- /chapters/en/chapter5/8.mdx: -------------------------------------------------------------------------------- 1 | 2 | 3 | # End-of-chapter quiz 4 | 5 | This chapter covered a lot of ground! Don't worry if you didn't grasp all the details; the next chapters will help you understand how things work under the hood. 6 | 7 | Before moving on, though, let's test what you learned in this chapter. 8 | 9 | ### 1. The `load_dataset()` function in 🤗 Datasets allows you to load a dataset from which of the following locations? 10 | 11 | data_files argument of load_dataset() to load local datasets.", 16 | correct: true 17 | }, 18 | { 19 | text: "The Hugging Face Hub", 20 | explain: "Correct! You can load datasets on the Hub by providing the dataset ID, e.g. load_dataset('emotion').", 21 | correct: true 22 | }, 23 | { 24 | text: "A remote server", 25 | explain: "Correct! You can pass URLs to the data_files argument of load_dataset() to load remote files.", 26 | correct: true 27 | }, 28 | ]} 29 | /> 30 | 31 | ### 2. Suppose you load one of the GLUE tasks as follows: 32 | 33 | ```py 34 | from datasets import load_dataset 35 | 36 | dataset = load_dataset("glue", "mrpc", split="train") 37 | ``` 38 | 39 | Which of the following commands will produce a random sample of 50 elements from `dataset`? 40 | 41 | dataset.sample(50)", 45 | explain: "This is incorrect -- there is no Dataset.sample() method." 46 | }, 47 | { 48 | text: "dataset.shuffle().select(range(50))", 49 | explain: "Correct! As you saw in this chapter, you first shuffle the dataset and then select the samples from it.", 50 | correct: true 51 | }, 52 | { 53 | text: "dataset.select(range(50)).shuffle()", 54 | explain: "This is incorrect -- although the code will run, it will only shuffle the first 50 elements in the dataset." 55 | } 56 | ]} 57 | /> 58 | 59 | ### 3. Suppose you have a dataset about household pets called `pets_dataset`, which has a `name` column that denotes the name of each pet. Which of the following approaches would allow you to filter the dataset for all pets whose names start with the letter "L"? 60 | 61 | pets_dataset.filter(lambda x : x['name'].startswith('L'))", 65 | explain: "Correct! Using a Python lambda function for these quick filters is a great idea. 
Can you think of another solution?", 66 | correct: true 67 | }, 68 | { 69 | text: "pets_dataset.filter(lambda x['name'].startswith('L'))", 70 | explain: "This is incorrect -- a lambda function takes the general form lambda *arguments* : *expression*, so you need to provide arguments in this case." 71 | }, 72 | { 73 | text: "Create a function like def filter_names(x): return x['name'].startswith('L') and run pets_dataset.filter(filter_names).", 74 | explain: "Correct! Just like with Dataset.map(), you can pass explicit functions to Dataset.filter(). This is useful when you have some complex logic that isn't suitable for a short lambda function. Which of the other solutions would work?", 75 | correct: true 76 | } 77 | ]} 78 | /> 79 | 80 | ### 4. What is memory mapping? 81 | 82 | 99 | 100 | ### 5. Which of the following are the main benefits of memory mapping? 101 | 102 | 120 | 121 | ### 6. Why does the following code fail? 122 | 123 | ```py 124 | from datasets import load_dataset 125 | 126 | dataset = load_dataset("allocine", streaming=True, split="train") 127 | dataset[0] 128 | ``` 129 | 130 | IterableDataset.", 138 | explain: "Correct! An IterableDataset is a generator, not a container, so you should access its elements using next(iter(dataset)).", 139 | correct: true 140 | }, 141 | { 142 | text: "The allocine dataset doesn't have a train split.", 143 | explain: "This is incorrect -- check out the [allocine dataset card](https://huggingface.co/datasets/allocine) on the Hub to see which splits it contains." 144 | } 145 | ]} 146 | /> 147 | 148 | ### 7. Which of the following are the main benefits of creating a dataset card? 149 | 150 | 169 | 170 | 171 | ### 8. What is semantic search? 172 | 173 | 191 | 192 | ### 9. For asymmetric semantic search, you usually have: 193 | 194 | 211 | 212 | ### 10. Can I use 🤗 Datasets to load data for use in other domains, like speech processing? 213 | 214 | MNIST dataset on the Hub for a computer vision example." 219 | }, 220 | { 221 | text: "Yes", 222 | explain: "Correct! Check out the exciting developments with speech and vision in the 🤗 Transformers library to see how 🤗 Datasets is used in these domains.", 223 | correct : true 224 | }, 225 | ]} 226 | /> 227 | -------------------------------------------------------------------------------- /chapters/en/chapter2/3.mdx: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Models 4 | 5 | {#if fw === 'pt'} 6 | 7 | 13 | 14 | {:else} 15 | 16 | 22 | 23 | {/if} 24 | 25 | {#if fw === 'pt'} 26 | 27 | {:else} 28 | 29 | {/if} 30 | 31 | {#if fw === 'pt'} 32 | In this section we'll take a closer look at creating and using a model. We'll use the `AutoModel` class, which is handy when you want to instantiate any model from a checkpoint. 33 | 34 | The `AutoModel` class and all of its relatives are actually simple wrappers over the wide variety of models available in the library. It's a clever wrapper as it can automatically guess the appropriate model architecture for your checkpoint, and then instantiates a model with this architecture. 35 | 36 | {:else} 37 | In this section we'll take a closer look at creating and using a model. We'll use the `TFAutoModel` class, which is handy when you want to instantiate any model from a checkpoint. 38 | 39 | The `TFAutoModel` class and all of its relatives are actually simple wrappers over the wide variety of models available in the library. 
It's a clever wrapper as it can automatically guess the appropriate model architecture for your checkpoint, and then instantiates a model with this architecture. 40 | 41 | {/if} 42 | 43 | However, if you know the type of model you want to use, you can use the class that defines its architecture directly. Let's take a look at how this works with a BERT model. 44 | 45 | ## Creating a Transformer 46 | 47 | The first thing we'll need to do to initialize a BERT model is load a configuration object: 48 | 49 | {#if fw === 'pt'} 50 | ```py 51 | from transformers import BertConfig, BertModel 52 | 53 | # Building the config 54 | config = BertConfig() 55 | 56 | # Building the model from the config 57 | model = BertModel(config) 58 | ``` 59 | {:else} 60 | ```py 61 | from transformers import BertConfig, TFBertModel 62 | 63 | # Building the config 64 | config = BertConfig() 65 | 66 | # Building the model from the config 67 | model = TFBertModel(config) 68 | ``` 69 | {/if} 70 | 71 | The configuration contains many attributes that are used to build the model: 72 | 73 | ```py 74 | print(config) 75 | ``` 76 | 77 | ```python out 78 | BertConfig { 79 | [...] 80 | "hidden_size": 768, 81 | "intermediate_size": 3072, 82 | "max_position_embeddings": 512, 83 | "num_attention_heads": 12, 84 | "num_hidden_layers": 12, 85 | [...] 86 | } 87 | ``` 88 | 89 | While you haven't seen what all of these attributes do yet, you should recognize some of them: the `hidden_size` attribute defines the size of the `hidden_states` vector, and `num_hidden_layers` defines the number of layers the Transformer model has. 90 | 91 | ### Different loading methods 92 | 93 | Creating a model from the default configuration initializes it with random values: 94 | 95 | {#if fw === 'pt'} 96 | ```py 97 | from transformers import BertConfig, BertModel 98 | 99 | config = BertConfig() 100 | model = BertModel(config) 101 | 102 | # Model is randomly initialized! 103 | ``` 104 | {:else} 105 | ```py 106 | from transformers import BertConfig, TFBertModel 107 | 108 | config = BertConfig() 109 | model = TFBertModel(config) 110 | 111 | # Model is randomly initialized! 112 | ``` 113 | {/if} 114 | 115 | The model can be used in this state, but it will output gibberish; it needs to be trained first. We could train the model from scratch on the task at hand, but as you saw in [Chapter 1](/course/chapter1), this would require a long time and a lot of data, and it would have a non-negligible environmental impact. To avoid unnecessary and duplicated effort, it's imperative to be able to share and reuse models that have already been trained. 116 | 117 | Loading a Transformer model that is already trained is simple — we can do this using the `from_pretrained()` method: 118 | 119 | {#if fw === 'pt'} 120 | ```py 121 | from transformers import BertModel 122 | 123 | model = BertModel.from_pretrained("bert-base-cased") 124 | ``` 125 | 126 | As you saw earlier, we could replace `BertModel` with the equivalent `AutoModel` class. We'll do this from now on as this produces checkpoint-agnostic code; if your code works for one checkpoint, it should work seamlessly with another. This applies even if the architecture is different, as long as the checkpoint was trained for a similar task (for example, a sentiment analysis task). 127 | 128 | {:else} 129 | ```py 130 | from transformers import TFBertModel 131 | 132 | model = TFBertModel.from_pretrained("bert-base-cased") 133 | ``` 134 | 135 | As you saw earlier, we could replace `TFBertModel` with the equivalent `TFAutoModel` class. 
We'll do this from now on as this produces checkpoint-agnostic code; if your code works for one checkpoint, it should work seamlessly with another. This applies even if the architecture is different, as long as the checkpoint was trained for a similar task (for example, a sentiment analysis task). 136 | 137 | {/if} 138 | 139 | In the code sample above we didn't use `BertConfig`, and instead loaded a pretrained model via the `bert-base-cased` identifier. This is a model checkpoint that was trained by the authors of BERT themselves; you can find more details about it in its [model card](https://huggingface.co/bert-base-cased). 140 | 141 | This model is now initialized with all the weights of the checkpoint. It can be used directly for inference on the tasks it was trained on, and it can also be fine-tuned on a new task. By training with pretrained weights rather than from scratch, we can quickly achieve good results. 142 | 143 | The weights have been downloaded and cached (so future calls to the `from_pretrained()` method won't re-download them) in the cache folder, which defaults to *~/.cache/huggingface/transformers*. You can customize your cache folder by setting the `HF_HOME` environment variable. 144 | 145 | The identifier used to load the model can be the identifier of any model on the Model Hub, as long as it is compatible with the BERT architecture. The entire list of available BERT checkpoints can be found [here](https://huggingface.co/models?filter=bert). 146 | 147 | ### Saving methods 148 | 149 | Saving a model is as easy as loading one — we use the `save_pretrained()` method, which is analogous to the `from_pretrained()` method: 150 | 151 | ```py 152 | model.save_pretrained("directory_on_my_computer") 153 | ``` 154 | 155 | This saves two files to your disk: 156 | 157 | {#if fw === 'pt'} 158 | ``` 159 | ls directory_on_my_computer 160 | 161 | config.json pytorch_model.bin 162 | ``` 163 | {:else} 164 | ``` 165 | ls directory_on_my_computer 166 | 167 | config.json tf_model.h5 168 | ``` 169 | {/if} 170 | 171 | If you take a look at the *config.json* file, you'll recognize the attributes necessary to build the model architecture. This file also contains some metadata, such as where the checkpoint originated and what 🤗 Transformers version you were using when you last saved the checkpoint. 172 | 173 | {#if fw === 'pt'} 174 | The *pytorch_model.bin* file is known as the *state dictionary*; it contains all your model's weights. The two files go hand in hand; the configuration is necessary to know your model's architecture, while the model weights are your model's parameters. 175 | 176 | {:else} 177 | The *tf_model.h5* file is known as the *state dictionary*; it contains all your model's weights. The two files go hand in hand; the configuration is necessary to know your model's architecture, while the model weights are your model's parameters. 178 | 179 | {/if} 180 | 181 | ## Using a Transformer model for inference 182 | 183 | Now that you know how to load and save a model, let's try using it to make some predictions. Transformer models can only process numbers — numbers that the tokenizer generates. But before we discuss tokenizers, let's explore what inputs the model accepts. 184 | 185 | Tokenizers can take care of casting the inputs to the appropriate framework's tensors, but to help you understand what's going on, we'll take a quick look at what must be done before sending the inputs to the model. 
186 | 187 | Let's say we have a couple of sequences: 188 | 189 | ```py 190 | sequences = ["Hello!", "Cool.", "Nice!"] 191 | ``` 192 | 193 | The tokenizer converts these to vocabulary indices which are typically called *input IDs*. Each sequence is now a list of numbers! The resulting output is: 194 | 195 | ```py no-format 196 | encoded_sequences = [ 197 | [101, 7592, 999, 102], 198 | [101, 4658, 1012, 102], 199 | [101, 3835, 999, 102], 200 | ] 201 | ``` 202 | 203 | This is a list of encoded sequences: a list of lists. Tensors only accept rectangular shapes (think matrices). This "array" is already of rectangular shape, so converting it to a tensor is easy: 204 | 205 | {#if fw === 'pt'} 206 | ```py 207 | import torch 208 | 209 | model_inputs = torch.tensor(encoded_sequences) 210 | ``` 211 | {:else} 212 | ```py 213 | import tensorflow as tf 214 | 215 | model_inputs = tf.constant(encoded_sequences) 216 | ``` 217 | {/if} 218 | 219 | ### Using the tensors as inputs to the model 220 | 221 | Making use of the tensors with the model is extremely simple — we just call the model with the inputs: 222 | 223 | ```py 224 | output = model(model_inputs) 225 | ``` 226 | 227 | While the model accepts a lot of different arguments, only the input IDs are necessary. We'll explain what the other arguments do and when they are required later, 228 | but first we need to take a closer look at the tokenizers that build the inputs that a Transformer model can understand. 229 | -------------------------------------------------------------------------------- /chapters/en/chapter3/3.mdx: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Fine-tuning a model with the Trainer API 4 | 5 | 11 | 12 | 13 | 14 | 🤗 Transformers provides a `Trainer` class to help you fine-tune any of the pretrained models it provides on your dataset. Once you've done all the data preprocessing work in the last section, you have just a few steps left to define the `Trainer`. The hardest part is likely to be preparing the environment to run `Trainer.train()`, as it will run very slowly on a CPU. If you don't have a GPU set up, you can get access to free GPUs or TPUs on [Google Colab](https://colab.research.google.com/). 15 | 16 | The code examples below assume you have already executed the examples in the previous section. Here is a short summary recapping what you need: 17 | 18 | ```py 19 | from datasets import load_dataset 20 | from transformers import AutoTokenizer, DataCollatorWithPadding 21 | 22 | raw_datasets = load_dataset("glue", "mrpc") 23 | checkpoint = "bert-base-uncased" 24 | tokenizer = AutoTokenizer.from_pretrained(checkpoint) 25 | 26 | 27 | def tokenize_function(example): 28 | return tokenizer(example["sentence1"], example["sentence2"], truncation=True) 29 | 30 | 31 | tokenized_datasets = raw_datasets.map(tokenize_function, batched=True) 32 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer) 33 | ``` 34 | 35 | ### Training 36 | 37 | The first step before we can define our `Trainer` is to define a `TrainingArguments` class that will contain all the hyperparameters the `Trainer` will use for training and evaluation. The only argument you have to provide is a directory where the trained model will be saved, as well as the checkpoints along the way. For all the rest, you can leave the defaults, which should work pretty well for a basic fine-tuning. 
38 | 39 | ```py 40 | from transformers import TrainingArguments 41 | 42 | training_args = TrainingArguments("test-trainer") 43 | ``` 44 | 45 | 46 | 47 | 💡 If you want to automatically upload your model to the Hub during training, pass along `push_to_hub=True` in the `TrainingArguments`. We will learn more about this in [Chapter 4](/course/chapter4/3) 48 | 49 | 50 | 51 | The second step is to define our model. As in the [previous chapter](/course/chapter2), we will use the `AutoModelForSequenceClassification` class, with two labels: 52 | 53 | ```py 54 | from transformers import AutoModelForSequenceClassification 55 | 56 | model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) 57 | ``` 58 | 59 | You will notice that unlike in [Chapter 2](/course/chapter2), you get a warning after instantiating this pretrained model. This is because BERT has not been pretrained on classifying pairs of sentences, so the head of the pretrained model has been discarded and a new head suitable for sequence classification has been added instead. The warnings indicate that some weights were not used (the ones corresponding to the dropped pretraining head) and that some others were randomly initialized (the ones for the new head). It concludes by encouraging you to train the model, which is exactly what we are going to do now. 60 | 61 | Once we have our model, we can define a `Trainer` by passing it all the objects constructed up to now — the `model`, the `training_args`, the training and validation datasets, our `data_collator`, and our `tokenizer`: 62 | 63 | ```py 64 | from transformers import Trainer 65 | 66 | trainer = Trainer( 67 | model, 68 | training_args, 69 | train_dataset=tokenized_datasets["train"], 70 | eval_dataset=tokenized_datasets["validation"], 71 | data_collator=data_collator, 72 | tokenizer=tokenizer, 73 | ) 74 | ``` 75 | 76 | Note that when you pass the `tokenizer` as we did here, the default `data_collator` used by the `Trainer` will be a `DataCollatorWithPadding` as defined previously, so you can skip the line `data_collator=data_collator` in this call. It was still important to show you this part of the processing in section 2! 77 | 78 | To fine-tune the model on our dataset, we just have to call the `train()` method of our `Trainer`: 79 | 80 | ```py 81 | trainer.train() 82 | ``` 83 | 84 | This will start the fine-tuning (which should take a couple of minutes on a GPU) and report the training loss every 500 steps. It won't, however, tell you how well (or badly) your model is performing. This is because: 85 | 86 | 1. We didn't tell the `Trainer` to evaluate during training by setting `evaluation_strategy` to either `"steps"` (evaluate every `eval_steps`) or `"epoch"` (evaluate at the end of each epoch). 87 | 2. We didn't provide the `Trainer` with a `compute_metrics()` function to calculate a metric during said evaluation (otherwise the evaluation would just have printed the loss, which is not a very intuitive number). 88 | 89 | 90 | ### Evaluation 91 | 92 | Let's see how we can build a useful `compute_metrics()` function and use it the next time we train. The function must take an `EvalPrediction` object (which is a named tuple with a `predictions` field and a `label_ids` field) and will return a dictionary mapping strings to floats (the strings being the names of the metrics returned, and the floats their values). 
To get some predictions from our model, we can use the `Trainer.predict()` command: 93 | 94 | ```py 95 | predictions = trainer.predict(tokenized_datasets["validation"]) 96 | print(predictions.predictions.shape, predictions.label_ids.shape) 97 | ``` 98 | 99 | ```python out 100 | (408, 2) (408,) 101 | ``` 102 | 103 | The output of the `predict()` method is another named tuple with three fields: `predictions`, `label_ids`, and `metrics`. The `metrics` field will just contain the loss on the dataset passed, as well as some time metrics (how long it took to predict, in total and on average). Once we complete our `compute_metrics()` function and pass it to the `Trainer`, that field will also contain the metrics returned by `compute_metrics()`. 104 | 105 | As you can see, `predictions` is a two-dimensional array with shape 408 x 2 (408 being the number of elements in the dataset we used). Those are the logits for each element of the dataset we passed to `predict()` (as you saw in the [previous chapter](/course/chapter2), all Transformer models return logits). To transform them into predictions that we can compare to our labels, we need to take the index with the maximum value on the second axis: 106 | 107 | ```py 108 | import numpy as np 109 | 110 | preds = np.argmax(predictions.predictions, axis=-1) 111 | ``` 112 | 113 | We can now compare those `preds` to the labels. To build our `compute_metrics()` function, we will rely on the metrics from the 🤗 Datasets library. We can load the metrics associated with the MRPC dataset as easily as we loaded the dataset, this time with the `load_metric()` function. The object returned has a `compute()` method we can use to do the metric calculation: 114 | 115 | ```py 116 | from datasets import load_metric 117 | 118 | metric = load_metric("glue", "mrpc") 119 | metric.compute(predictions=preds, references=predictions.label_ids) 120 | ``` 121 | 122 | ```python out 123 | {'accuracy': 0.8578431372549019, 'f1': 0.8996539792387542} 124 | ``` 125 | 126 | The exact results you get may vary, as the random initialization of the model head might change the metrics it achieved. Here, we can see our model has an accuracy of 85.78% on the validation set and an F1 score of 89.97. Those are the two metrics used to evaluate results on the MRPC dataset for the GLUE benchmark. The table in the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf) reported an F1 score of 88.9 for the base model. Since we are fine-tuning the `uncased` checkpoint here and evaluating on the validation set rather than reproducing the paper's exact setup, the numbers aren't directly comparable, but our result is in the same ballpark.
127 | 128 | Wrapping everything together, we get our `compute_metrics()` function: 129 | 130 | ```py 131 | def compute_metrics(eval_preds): 132 | metric = load_metric("glue", "mrpc") 133 | logits, labels = eval_preds 134 | predictions = np.argmax(logits, axis=-1) 135 | return metric.compute(predictions=predictions, references=labels) 136 | ``` 137 | 138 | And to see it used in action to report metrics at the end of each epoch, here is how we define a new `Trainer` with this `compute_metrics()` function: 139 | 140 | ```py 141 | training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch") 142 | model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) 143 | 144 | trainer = Trainer( 145 | model, 146 | training_args, 147 | train_dataset=tokenized_datasets["train"], 148 | eval_dataset=tokenized_datasets["validation"], 149 | data_collator=data_collator, 150 | tokenizer=tokenizer, 151 | compute_metrics=compute_metrics, 152 | ) 153 | ``` 154 | 155 | Note that we create a new `TrainingArguments` with its `evaluation_strategy` set to `"epoch"` and a new model — otherwise, we would just be continuing the training of the model we have already trained. To launch a new training run, we execute: 156 | 157 | ``` 158 | trainer.train() 159 | ``` 160 | 161 | This time, it will report the validation loss and metrics at the end of each epoch on top of the training loss. Again, the exact accuracy/F1 score you reach might be a bit different from what we found, because of the random head initialization of the model, but it should be in the same ballpark. 162 | 163 | The `Trainer` will work out of the box on multiple GPUs or TPUs and provides lots of options, like mixed-precision training (use `fp16 = True` in your training arguments). We will go over everything it supports in Chapter 10. 164 | 165 | This concludes the introduction to fine-tuning using the `Trainer` API. An example of doing this for most common NLP tasks will be given in Chapter 7, but for now let's look at how to do the same thing in pure PyTorch. 166 | 167 | 168 | 169 | ✏️ **Try it out!** Fine-tune a model on the GLUE SST-2 dataset, using the data processing you did in section 2. 170 | 171 | 172 | 173 | -------------------------------------------------------------------------------- /chapters/en/event/1.mdx: -------------------------------------------------------------------------------- 1 | # Part 2 Release Event 2 | 3 | For the release of part 2 of the course, we organized a live event with two days of talks before a fine-tuning sprint. If you missed it, you can catch up with the talks which are all listed below! 4 | 5 | ## Day 1: A high-level view of Transformers and how to train them 6 | 7 | **Thomas Wolf:** *Transfer Learning and the birth of the Transformers library* 8 | 9 |
10 | 11 |
12 | 13 |

14 | A visual summary of Thom's talk 15 |

16 | 17 | Thomas Wolf is co-founder and Chief Science Officer of Hugging Face. The tools created by Thomas Wolf and the Hugging Face team are used across more than 5,000 research organisations including Facebook Artificial Intelligence Research, Google Research, DeepMind, Amazon Research, Apple, the Allen Institute for Artificial Intelligence as well as most university departments. Thomas Wolf is the initiator and senior chair of the largest research collaboration that has ever existed in Artificial Intelligence: [“BigScience”](https://bigscience.huggingface.co), as well as a set of widely used [libraries and tools](https://github.com/huggingface/). Thomas Wolf is also a prolific educator, a thought leader in the field of Artificial Intelligence and Natural Language Processing, and a regular invited speaker to conferences all around the world [https://thomwolf.io](https://thomwolf.io). 18 | 19 | **Jay Alammar:** *A gentle visual intro to Transformers models* 20 | 21 |
22 | 23 |
24 | 25 |

26 | A visual summary of Jay's talk 27 |

28 | 29 | Through his popular ML blog, Jay has helped millions of researchers and engineers visually understand machine learning tools and concepts from the basic (ending up in NumPy, Pandas docs) to the cutting-edge (Transformers, BERT, GPT-3). 30 | 31 | **Margaret Mitchell:** *On Values in ML Development* 32 | 33 |
34 | 35 |
36 | 37 |

38 | A visual summary of Margaret's talk 39 |

40 | 41 | Margaret Mitchell is a researcher working on Ethical AI, currently focused on the ins and outs of ethics-informed AI development in tech. She has published over 50 papers on natural language generation, assistive technology, computer vision, and AI ethics, and holds multiple patents in the areas of conversation generation and sentiment classification. She previously worked at Google AI as a Staff Research Scientist, where she founded and co-led Google's Ethical AI group, focused on foundational AI ethics research and operationalizing AI ethics Google-internally. Before joining Google, she was a researcher at Microsoft Research, focused on computer vision-to-language generation; and was a postdoc at Johns Hopkins, focused on Bayesian modeling and information extraction. She holds a PhD in Computer Science from the University of Aberdeen and a Master's in computational linguistics from the University of Washington. While earning her degrees, she also worked from 2005-2012 on machine learning, neurological disorders, and assistive technology at Oregon Health and Science University. She has spearheaded a number of workshops and initiatives at the intersections of diversity, inclusion, computer science, and ethics. Her work has received awards from Secretary of Defense Ash Carter and the American Foundation for the Blind, and has been implemented by multiple technology companies. She likes gardening, dogs, and cats. 42 | 43 | **Matthew Watson and Chen Qian:** *NLP workflows with Keras* 44 | 45 |
46 | 47 |
48 | 49 |

50 | A visual summary of Matt and Chen's talk 51 |

52 | 53 | Matthew Watson is a machine learning engineer on the Keras team, with a focus on high-level modeling APIs. He studied Computer Graphics during undergrad and a Masters at Stanford University. An almost English major who turned towards computer science, he is passionate about working across disciplines and making NLP accessible to a wider audience. 54 | 55 | Chen Qian is a software engineer from Keras team, with a focus on high-level modeling APIs. Chen got a Master degree of Electrical Engineering from Stanford University, and he is especially interested in simplifying code implementations of ML tasks and large-scale ML. 56 | 57 | **Mark Saroufim:** *How to Train a Model with Pytorch* 58 | 59 |
60 | 61 |
62 | 63 |

64 | A visual summary of Mark's talk 65 |

66 | 67 | Mark Saroufim is a Partner Engineer at Pytorch working on OSS production tools including TorchServe and Pytorch Enterprise. In his past lives, Mark was an Applied Scientist and Product Manager at Graphcore, [yuri.ai](http://yuri.ai/), Microsoft and NASA's JPL. His primary passion is to make programming more fun. 68 | 69 | **Jakob Uszkoreit:** *It Ain't Broke So Don't Fix Let's Break It* 70 | 71 |
72 | 73 |
74 | 75 |

76 | A visual summary of Jakob's talk 77 |

78 | 79 | Jakob Uszkoreit is the co-founder of Inceptive. Inceptive designs RNA molecules for vaccines and therapeutics using large-scale deep learning in a tight loop with high throughput experiments with the goal of making RNA-based medicines more accessible, more effective and more broadly applicable. Previously, Jakob worked at Google for more than a decade, leading research and development teams in Google Brain, Research and Search working on deep learning fundamentals, computer vision, language understanding and machine translation. 80 | 81 | ## Day 2: The tools to use 82 | 83 | **Lewis Tunstall:** *Simple Training with the 🤗 Transformers Trainer* 84 | 85 |
86 | 87 |
88 | 89 | Lewis is a machine learning engineer at Hugging Face, focused on developing open-source tools and making them accessible to the wider community. He is also a co-author of an upcoming O’Reilly book on Transformers and you can follow him on Twitter (@_lewtun) for NLP tips and tricks! 90 | 91 | **Matthew Carrigan:** *New TensorFlow Features for 🤗 Transformers and 🤗 Datasets* 92 | 93 |
94 | 95 |
96 | 97 | Matt is responsible for TensorFlow maintenance at Transformers, and will eventually lead a coup against the incumbent PyTorch faction which will likely be co-ordinated via his Twitter account @carrigmat. 98 | 99 | **Lysandre Debut:** *The Hugging Face Hub as a means to collaborate on and share Machine Learning projects* 100 | 101 |
102 | 103 |
104 | 105 |

106 | A visual summary of Lysandre's talk 107 |

108 | 109 | Lysandre is a Machine Learning Engineer at Hugging Face where he is involved in many open source projects. His aim is to make Machine Learning accessible to everyone by developing powerful tools with a very simple API. 110 | 111 | **Lucile Saulnier:** *Get your own tokenizer with 🤗 Transformers & 🤗 Tokenizers* 112 | 113 |
114 | 115 |
116 | 117 | Lucile is a machine learning engineer at Hugging Face, developing and supporting the use of open source tools. She is also actively involved in many research projects in the field of Natural Language Processing such as collaborative training and BigScience. 118 | 119 | **Sylvain Gugger:** *Supercharge your PyTorch training loop with 🤗 Accelerate* 120 | 121 |
122 | 123 |
124 | 125 | Sylvain is a Research Engineer at Hugging Face and one of the core maintainers of 🤗 Transformers and the developer behind 🤗 Accelerate. He likes making model training more accessible. 126 | 127 | **Merve Noyan:** *Showcase your model demos with 🤗 Spaces* 128 | 129 |
130 | 131 |
132 | 133 | Merve is a developer advocate at Hugging Face, working on developing tools and building content around them to democratize machine learning for everyone. 134 | 135 | **Abubakar Abid:** *Building Machine Learning Applications Fast* 136 | 137 |
138 | 139 |
140 | 141 |

142 | A visual summary of Abubakar's talk 143 |

144 | 145 | Abubakar Abid is the CEO of [Gradio](https://www.gradio.app). He received his Bachelor of Science in Electrical Engineering and Computer Science from MIT in 2015, and his PhD in Applied Machine Learning from Stanford in 2021. In his role as the CEO of Gradio, Abubakar works on making machine learning models easier to demo, debug, and deploy. 146 | 147 | **Mathieu Desvé:** *AWS ML Vision: Making Machine Learning Accessible to all Customers* 148 | 149 |
150 | 151 |
152 | 153 |

154 | A visual summary of Mathieu's talk 155 |

156 | 157 | Technology enthusiast and maker in my free time. I like challenges and solving problems for clients and users, and working with talented people to learn something new every day. Since 2004 I have worked in multiple positions, switching between frontend, backend, infrastructure, operations, and management, trying to solve common technical and managerial issues in an agile manner. 158 | 159 | **Philipp Schmid:** *Managed Training with Amazon SageMaker and 🤗 Transformers* 160 | 161 |
162 | 163 |
164 | 165 | Philipp Schmid is a Machine Learning Engineer and Tech Lead at Hugging Face, where he leads the collaboration with the Amazon SageMaker team. He is passionate about democratizing and productionizing cutting-edge NLP models and improving the ease of use for Deep Learning. -------------------------------------------------------------------------------- /utils/generate_notebooks.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import re 4 | import nbformat 5 | import shutil 6 | import yaml 7 | 8 | from pathlib import Path 9 | 10 | PATH_TO_COURSE = "chapters/en/" 11 | 12 | re_framework_test = re.compile(r"^{#if\s+fw\s+===\s+'([^']+)'}\s*$") 13 | re_framework_else = re.compile(r"^{:else}\s*$") 14 | re_framework_end = re.compile(r"^{/if}\s*$") 15 | 16 | re_html_line = re.compile(r"^<[^>]*/>\s*$") 17 | re_html_tag = re.compile(r"<([^/>]*)>\s*$") 18 | 19 | re_python_code = re.compile(r"^```(?:py|python|py no\-format|python no\-format)\s*$") 20 | re_output_code = re.compile(r"^```(?:py|python)\s+out\s*$") 21 | re_end_code = re.compile(r"^```\s*$") 22 | 23 | frameworks = {"pt": "PyTorch", "tf": "TensorFlow"} 24 | 25 | def read_and_split_frameworks(fname): 26 | """ 27 | Read the MDX in fname and creates two versions (if necessary) for each framework. 28 | """ 29 | with open(fname, "r") as f: 30 | content = f.readlines() 31 | 32 | contents = {"pt": [], "tf": []} 33 | 34 | differences = False 35 | current_content = [] 36 | line_idx = 0 37 | for line in content: 38 | if re_framework_test.search(line) is not None: 39 | differences = True 40 | framework = re_framework_test.search(line).groups()[0] 41 | for key in contents: 42 | contents[key].extend(current_content) 43 | current_content = [] 44 | elif re_framework_else.search(line) is not None: 45 | contents[framework].extend(current_content) 46 | current_content = [] 47 | framework = "pt" if framework == "tf" else "tf" 48 | elif re_framework_end.search(line) is not None: 49 | contents[framework].extend(current_content) 50 | current_content = [] 51 | else: 52 | current_content.append(line) 53 | 54 | if len(current_content) > 0: 55 | for key in contents: 56 | contents[key].extend(current_content) 57 | 58 | if differences: 59 | return {k: "".join(content) for k, content in contents.items()} 60 | else: 61 | return "".join(content) 62 | 63 | 64 | def extract_cells(content): 65 | """ 66 | Extract the code/output cells from content. 67 | """ 68 | cells = [] 69 | current_cell = None 70 | is_output = False 71 | for line in content.split("\n"): 72 | if re_python_code.search(line) is not None: 73 | is_output = False 74 | current_cell = [] 75 | elif re_output_code.search(line) is not None: 76 | is_output = True 77 | current_cell = [] 78 | elif re_end_code.search(line) is not None and current_cell is not None: 79 | cell = "\n".join(current_cell) 80 | if is_output: 81 | if not isinstance(cells[-1], tuple): 82 | cells[-1] = (cells[-1], cell) 83 | else: 84 | cells.append(cell) 85 | current_cell = None 86 | current_md = [] 87 | elif current_cell is not None: 88 | current_cell.append(line) 89 | 90 | return cells 91 | 92 | 93 | def convert_to_nb_cell(cell): 94 | """ 95 | Convert some cell (either just code or tuple (code, output)) to a proper notebook cell. 
96 | """ 97 | nb_cell = {"cell_type": "code", "execution_count": None, "metadata": {}} 98 | if isinstance(cell, tuple): 99 | nb_cell["source"] = cell[0] 100 | nb_cell["outputs"] = [nbformat.notebooknode.NotebookNode({ 101 | 'data': {'text/plain': cell[1]}, 102 | 'execution_count': None, 103 | 'metadata': {}, 104 | 'output_type': 'execute_result', 105 | })] 106 | else: 107 | nb_cell["source"] = cell 108 | nb_cell["outputs"] = [] 109 | return nbformat.notebooknode.NotebookNode(nb_cell) 110 | 111 | 112 | def nb_cell(source, code=True): 113 | if not code: 114 | return nbformat.notebooknode.NotebookNode( 115 | {"cell_type": "markdown", "source": source, "metadata": {}} 116 | ) 117 | return nbformat.notebooknode.NotebookNode( 118 | {"cell_type": "code", "metadata": {}, "source": source, "execution_count": None, "outputs": []} 119 | ) 120 | 121 | 122 | def build_notebook(fname, title, output_dir="."): 123 | """ 124 | Build the notebook for fname with a given title in output_dir. 125 | """ 126 | sections = read_and_split_frameworks(fname) 127 | sections_with_accelerate = [ 128 | "A full training", 129 | "Token classification (PyTorch)", 130 | "Fine-tuning a masked language model (PyTorch)", 131 | "Translation (PyTorch)", 132 | "Summarization (PyTorch)", 133 | "Training a causal language model from scratch (PyTorch)", 134 | "Question answering (PyTorch)", 135 | ] 136 | sections_with_hf_hub = [ 137 | "Sharing pretrained models (PyTorch)", 138 | "Sharing pretrained models (TensorFlow)", 139 | "Creating your own dataset", 140 | "Token classification (PyTorch)", 141 | "Token classification (TensorFlow)", 142 | "Training a new tokenizer from an old one", 143 | "Fine-tuning a masked language model (PyTorch)", 144 | "Fine-tuning a masked language model (TensorFlow)", 145 | "Translation (PyTorch)", 146 | "Translation (TensorFlow)", 147 | "Summarization (PyTorch)", 148 | "Summarization (TensorFlow)", 149 | "Training a causal language model from scratch (PyTorch)", 150 | "Training a causal language model from scratch (TensorFlow)", 151 | "Question answering (PyTorch)", 152 | "Question answering (TensorFlow)", 153 | "What to do when you get an error", 154 | ] 155 | sections_with_faiss = ["Semantic search with FAISS (PyTorch)", "Semantic search with FAISS (TensorFlow)"] 156 | stem = Path(fname).stem 157 | if not isinstance(sections, dict): 158 | contents = [sections] 159 | titles = [title] 160 | fnames = [f"{stem}.ipynb"] 161 | else: 162 | contents = [] 163 | titles = [] 164 | fnames = [] 165 | for key, section in sections.items(): 166 | contents.append(section) 167 | titles.append(f"{title} ({frameworks[key]})") 168 | fnames.append(f"{stem}_{key}.ipynb") 169 | 170 | for title, content, fname in zip(titles, contents, fnames): 171 | cells = extract_cells(content) 172 | if len(cells) == 0: 173 | continue 174 | 175 | nb_cells = [ 176 | nb_cell(f"# {title}", code=False), 177 | nb_cell("Install the Transformers and Datasets libraries to run this notebook.", code=False) 178 | ] 179 | 180 | # Install cell 181 | installs = ["!pip install datasets transformers[sentencepiece]"] 182 | if title in sections_with_accelerate: 183 | installs.append("!pip install accelerate") 184 | installs.append("# To run the training on TPU, you will need to uncomment the followin line:") 185 | installs.append("# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl") 186 | if title in sections_with_hf_hub: 187 | installs.append("!apt install git-lfs") 
188 | if title in sections_with_faiss: 189 | installs.append("!pip install faiss-gpu") 190 | 191 | nb_cells.append(nb_cell("\n".join(installs))) 192 | 193 | if title in sections_with_hf_hub: 194 | nb_cells.extend([ 195 | nb_cell("You will need to setup git, adapt your email and name in the following cell.", code=False), 196 | nb_cell("!git config --global user.email \"you@example.com\"\n!git config --global user.name \"Your Name\""), 197 | nb_cell("You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.", code=False), 198 | nb_cell("from huggingface_hub import notebook_login\n\nnotebook_login()"), 199 | ]) 200 | nb_cells += [convert_to_nb_cell(cell) for cell in cells] 201 | metadata = {"colab": {"name": title, "provenance": []}} 202 | nb_dict = {"cells": nb_cells, "metadata": metadata, "nbformat": 4, "nbformat_minor": 4} 203 | notebook = nbformat.notebooknode.NotebookNode(nb_dict) 204 | os.makedirs(output_dir, exist_ok=True) 205 | nbformat.write(notebook, os.path.join(output_dir, fname), version=4) 206 | 207 | 208 | def get_titles(): 209 | """ 210 | Parse the yaml _chapters.yml to get the correspondence filename to title 211 | """ 212 | table = yaml.safe_load(open(os.path.join(PATH_TO_COURSE, "_chapters.yml"), "r")) 213 | result = {} 214 | for entry in table: 215 | chapter_name = entry["local"] 216 | sections = [] 217 | for i, section in enumerate(entry["sections"]): 218 | if isinstance(section, str): 219 | result[os.path.join(chapter_name, f"section{i+1}")] = section 220 | else: 221 | section_name = section["local"] 222 | section_title = section["title"] 223 | if isinstance(section_name, str): 224 | result[os.path.join(chapter_name, section_name)] = section_title 225 | else: 226 | if isinstance(section_title, str): 227 | section_title = {key: section_title for key in section_name.keys()} 228 | for key in section_name.keys(): 229 | result[os.path.join(chapter_name, section_name[key])] = section_title[key] 230 | return {k: v for k, v in result.items() if "quiz" not in v} 231 | 232 | 233 | def create_notebooks(output_dir): 234 | for folder in os.listdir(output_dir): 235 | if folder.startswith("chapter"): 236 | shutil.rmtree(os.path.join(output_dir, folder)) 237 | titles = get_titles() 238 | for fname, title in titles.items(): 239 | build_notebook( 240 | os.path.join(PATH_TO_COURSE, f"{fname}.mdx"), 241 | title, 242 | os.path.join(output_dir, Path(fname).parent), 243 | ) 244 | 245 | 246 | if __name__ == "__main__": 247 | parser = argparse.ArgumentParser() 248 | parser.add_argument("--output_dir", type=str, help="Where to output the notebooks") 249 | args = parser.parse_args() 250 | 251 | create_notebooks(args.output_dir) 252 | -------------------------------------------------------------------------------- /chapters/en/chapter3/3_tf.mdx: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Fine-tuning a model with Keras 4 | 5 | 11 | 12 | Once you've done all the data preprocessing work in the last section, you have just a few steps left to train the model. Note, however, that the `model.fit()` command will run very slowly on a CPU. If you don't have a GPU set up, you can get access to free GPUs or TPUs on [Google Colab](https://colab.research.google.com/). 13 | 14 | The code examples below assume you have already executed the examples in the previous section. 
Here is a short summary recapping what you need: 15 | 16 | ```py 17 | from datasets import load_dataset 18 | from transformers import AutoTokenizer, DataCollatorWithPadding 19 | import numpy as np 20 | 21 | raw_datasets = load_dataset("glue", "mrpc") 22 | checkpoint = "bert-base-uncased" 23 | tokenizer = AutoTokenizer.from_pretrained(checkpoint) 24 | 25 | 26 | def tokenize_function(example): 27 | return tokenizer(example["sentence1"], example["sentence2"], truncation=True) 28 | 29 | 30 | tokenized_datasets = raw_datasets.map(tokenize_function, batched=True) 31 | 32 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf") 33 | 34 | tf_train_dataset = tokenized_datasets["train"].to_tf_dataset( 35 | columns=["attention_mask", "input_ids", "token_type_ids"], 36 | label_cols=["labels"], 37 | shuffle=True, 38 | collate_fn=data_collator, 39 | batch_size=8, 40 | ) 41 | 42 | tf_validation_dataset = tokenized_datasets["validation"].to_tf_dataset( 43 | columns=["attention_mask", "input_ids", "token_type_ids"], 44 | label_cols=["labels"], 45 | shuffle=False, 46 | collate_fn=data_collator, 47 | batch_size=8, 48 | ) 49 | ``` 50 | 51 | ### Training 52 | 53 | TensorFlow models imported from 🤗 Transformers are already Keras models. Here is a short introduction to Keras. 54 | 55 | 56 | 57 | That means that once we have our data, very little work is required to begin training on it. 58 | 59 | 60 | 61 | As in the [previous chapter](/course/chapter2), we will use the `TFAutoModelForSequenceClassification` class, with two labels: 62 | 63 | ```py 64 | from transformers import TFAutoModelForSequenceClassification 65 | 66 | model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) 67 | ``` 68 | 69 | You will notice that unlike in [Chapter 2](/course/chapter2), you get a warning after instantiating this pretrained model. This is because BERT has not been pretrained on classifying pairs of sentences, so the head of the pretrained model has been discarded and a new head suitable for sequence classification has been inserted instead. The warnings indicate that some weights were not used (the ones corresponding to the dropped pretraining head) and that some others were randomly initialized (the ones for the new head). It concludes by encouraging you to train the model, which is exactly what we are going to do now. 70 | 71 | To fine-tune the model on our dataset, we just have to `compile()` our model and then pass our data to the `fit()` method. This will start the fine-tuning process (which should take a couple of minutes on a GPU) and report training loss as it goes, plus the validation loss at the end of each epoch. 72 | 73 | 74 | 75 | Note that 🤗 Transformers models have a special ability that most Keras models don't - they can automatically use an appropriate loss which they compute internally. They will use this loss by default if you don't set a loss argument in `compile()`. Note that to use the internal loss you'll need to pass your labels as part of the input, not as a separate label, which is the normal way to use labels with Keras models. You'll see examples of this in Part 2 of the course, where defining the correct loss function can be tricky. For sequence classification, however, a standard Keras loss function works fine, so that's what we'll use here. 
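To make the internal-loss option concrete, here is a minimal sketch of what it would look like. This is purely illustrative: the `tf_train_with_labels` dataset is something we build just for this sketch, and it assumes a recent enough version of 🤗 Transformers that picks up the internal loss when `compile()` is called without a `loss` argument.

```py
# Purely illustrative: build a version of the training set where the labels are
# part of the model inputs, so the model can compute its loss internally.
tf_train_with_labels = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids", "labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

model.compile(optimizer="adam")  # no loss argument: the model's internal loss is used
model.fit(tf_train_with_labels)
```

In this section, though, we'll stick with an explicit Keras loss: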
76 | 77 | 78 | 79 | ```py 80 | from tensorflow.keras.losses import SparseCategoricalCrossentropy 81 | 82 | model.compile( 83 | optimizer="adam", 84 | loss=SparseCategoricalCrossentropy(from_logits=True), 85 | metrics=["accuracy"], 86 | ) 87 | model.fit( 88 | tf_train_dataset, 89 | validation_data=tf_validation_dataset, 90 | ) 91 | ``` 92 | 93 | 94 | 95 | Note a very common pitfall here — you *can* just pass the name of the loss as a string to Keras, but by default Keras will assume that you have already applied a softmax to your outputs. Many models, however, output the values right before the softmax is applied, which are also known as the *logits*. We need to tell the loss function that that's what our model does, and the only way to do that is to call it directly, rather than by name with a string. 96 | 97 | 98 | 99 | 100 | ### Improving training performance 101 | 102 | 103 | 104 | If you try the above code, it certainly runs, but you'll find that the loss declines only slowly or sporadically. The primary cause 105 | is the *learning rate*. As with the loss, when we pass Keras the name of an optimizer as a string, Keras initializes 106 | that optimizer with default values for all parameters, including learning rate. From long experience, though, we know 107 | that transformer models benefit from a much lower learning rate than the default for Adam, which is 1e-3, also written 108 | as 10 to the power of -3, or 0.001. 5e-5 (0.00005), which is some twenty times lower, is a much better starting point. 109 | 110 | In addition to lowering the learning rate, we have a second trick up our sleeve: We can slowly reduce the learning rate 111 | over the course of training. In the literature, you will sometimes see this referred to as *decaying* or *annealing* 112 | the learning rate. In Keras, the best way to do this is to use a *learning rate scheduler*. A good one to use is 113 | `PolynomialDecay` — despite the name, with default settings it simply linearly decays the learning rate from the initial 114 | value to the final value over the course of training, which is exactly what we want. In order to use a scheduler correctly, 115 | though, we need to tell it how long training is going to be. We compute that as `num_train_steps` below. 116 | 117 | ```py 118 | from tensorflow.keras.optimizers.schedules import PolynomialDecay 119 | 120 | batch_size = 8 121 | num_epochs = 3 122 | # The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied 123 | # by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset, 124 | # not the original Hugging Face Dataset, so its len() is already num_samples // batch_size. 125 | num_train_steps = len(tf_train_dataset) * num_epochs 126 | lr_scheduler = PolynomialDecay( 127 | initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps 128 | ) 129 | from tensorflow.keras.optimizers import Adam 130 | 131 | opt = Adam(learning_rate=lr_scheduler) 132 | ``` 133 | 134 | 135 | 136 | The 🤗 Transformers library also has a `create_optimizer()` function that will create an `AdamW` optimizer with learning rate decay. This is a convenient shortcut that you'll see in detail in future sections of the course. 137 | 138 | 139 | 140 | Now we have our all-new optimizer, and we can try training with it. 
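(As an aside, the `create_optimizer()` function mentioned in the tip above bundles an equivalent AdamW optimizer and decaying schedule into a single call. A minimal sketch, where the hyperparameter values are just illustrative choices on our part, might look like this:

```py
from transformers import create_optimizer

# Returns an AdamW optimizer together with the learning rate schedule it uses,
# decaying from init_lr towards zero over num_train_steps.
opt, lr_schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
```

Either optimizer can be passed to the `compile()` call below.)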
First, let's reload the model to reset the changes to the weights from the training run we just did, and then we can compile it with the new optimizer: 141 | 142 | ```py 143 | import tensorflow as tf 144 | 145 | model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) 146 | loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) 147 | model.compile(optimizer=opt, loss=loss, metrics=["accuracy"]) 148 | ``` 149 | 150 | Now, we fit again: 151 | 152 | ```py 153 | model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3) 154 | ``` 155 | 156 | 157 | 158 | 💡 If you want to automatically upload your model to the Hub during training, you can pass along a `PushToHubCallback` in the `model.fit()` method. We will learn more about this in [Chapter 4](/course/chapter4/3). 159 | 160 | 161 | 162 | ### Model predictions 163 | 164 | 165 | 166 | 167 | Training and watching the loss go down is all very nice, but what if we want to actually get outputs from the trained model, either to compute some metrics or to use the model in production? To do that, we can just use the `predict()` method. This will return the *logits* from the output head of the model, one per class. 168 | 169 | ```py 170 | preds = model.predict(tf_validation_dataset)["logits"] 171 | ``` 172 | 173 | We can convert these logits into the model's class predictions by using `argmax` to find the highest logit, which corresponds to the most likely class: 174 | 175 | ```py 176 | class_preds = np.argmax(preds, axis=1) 177 | print(preds.shape, class_preds.shape) 178 | ``` 179 | 180 | ```python out 181 | (408, 2) (408,) 182 | ``` 183 | 184 | Now, let's use those `class_preds` to compute some metrics! We can load the metrics associated with the MRPC dataset as easily as we loaded the dataset, this time with the `load_metric()` function. The object returned has a `compute()` method we can use to do the metric calculation: 185 | 186 | ```py 187 | from datasets import load_metric 188 | 189 | metric = load_metric("glue", "mrpc") 190 | metric.compute(predictions=class_preds, references=raw_datasets["validation"]["label"]) 191 | ``` 192 | 193 | ```python out 194 | {'accuracy': 0.8578431372549019, 'f1': 0.8996539792387542} 195 | ``` 196 | 197 | The exact results you get may vary, as the random initialization of the model head can change the metrics you obtain. Here, we can see our model has an accuracy of 85.78% on the validation set and an F1 score of 89.97. Those are the two metrics used to evaluate results on the MRPC dataset for the GLUE benchmark. The table in the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf) reported an F1 score of 88.9 for the base model. That score was computed on the GLUE test set, whereas we are evaluating on the validation set here, so a small difference like this is expected. 198 | 199 | This concludes the introduction to fine-tuning using the Keras API. An example of doing this for the most common NLP tasks will be given in Chapter 7. If you would like to hone your skills on the Keras API, try to fine-tune a model on the GLUE SST-2 dataset, using the data processing you did in section 2. 200 | -------------------------------------------------------------------------------- /chapters/en/chapter3/6.mdx: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # End-of-chapter quiz 6 | 7 | Test what you learned in this chapter! 8 | 9 | ### 1. The `emotion` dataset contains Twitter messages labeled with emotions.
Search for it in the [Hub](https://huggingface.co/datasets), and read the dataset card. Which of these is not one of its basic emotions? 10 | 11 | 32 | 33 | ### 2. Search for the `ar_sarcasm` dataset in the [Hub](https://huggingface.co/datasets). Which task does it support? 34 | 35 | dataset card!" 45 | }, 46 | { 47 | text: "Named entity recognition", 48 | explain: "That's not it — take another look at the dataset card!" 49 | }, 50 | { 51 | text: "Question answering", 52 | explain: "Alas, this question was not answered correctly. Try again!" 53 | } 54 | ]} 55 | /> 56 | 57 | ### 3. How does the BERT model expect a pair of sentences to be processed? 58 | 59 | [SEP] special token is needed to separate the two sentences, but that's not the only thing!" 64 | }, 65 | { 66 | text: "[CLS] Tokens_of_sentence_1 Tokens_of_sentence_2", 67 | explain: "A [CLS] special token is required at the beginning, but that's not the only thing!" 68 | }, 69 | { 70 | text: "[CLS] Tokens_of_sentence_1 [SEP] Tokens_of_sentence_2 [SEP]", 71 | explain: "That's correct!", 72 | correct: true 73 | }, 74 | { 75 | text: "[CLS] Tokens_of_sentence_1 [SEP] Tokens_of_sentence_2", 76 | explain: "A [CLS] special token is needed at the beginning as well as a [SEP] special token to separate the two sentences, but that's not all!" 77 | } 78 | ]} 79 | /> 80 | 81 | {#if fw === 'pt'} 82 | ### 4. What are the benefits of the `Dataset.map()` method? 83 | 84 | 103 | 104 | ### 5. What does dynamic padding mean? 105 | 106 | 123 | 124 | ### 6. What is the purpose of a collate function? 125 | 126 | DataCollatorWithPadding specifically." 131 | }, 132 | { 133 | text: "It puts together all the samples in a batch.", 134 | explain: "Correct! You can pass the collate function as an argument of a DataLoader. We used the DataCollatorWithPadding function, which pads all items in a batch so they have the same length.", 135 | correct: true 136 | }, 137 | { 138 | text: "It preprocesses the whole dataset.", 139 | explain: "That would be a preprocessing function, not a collate function." 140 | }, 141 | { 142 | text: "It truncates the sequences in the dataset.", 143 | explain: "A collate function is involved in handling individual batches, not the whole dataset. If you're interested in truncating, you can use the truncate argument of tokenizer." 144 | } 145 | ]} 146 | /> 147 | 148 | ### 7. What happens when you instantiate one of the `AutoModelForXxx` classes with a pretrained language model (such as `bert-base-uncased`) that corresponds to a different task than the one for which it was trained? 149 | 150 | AutoModelForSequenceClassification with bert-base-uncased, we got warnings when instantiating the model. The pretrained head is not used for the sequence classification task, so it's discarded and a new head is instantiated with random weights.", 159 | correct: true 160 | }, 161 | { 162 | text: "The head of the pretrained model is discarded.", 163 | explain: "Something else needs to happen. Try again!" 164 | }, 165 | { 166 | text: "Nothing, since the model can still be fine-tuned for the different task.", 167 | explain: "The head of the pretrained model was not trained to solve this task, so we should discard the head!" 168 | } 169 | ]} 170 | /> 171 | 172 | ### 8. What's the purpose of `TrainingArguments`? 173 | 174 | Trainer.", 178 | explain: "Correct!", 179 | correct: true 180 | }, 181 | { 182 | text: "It specifies the size of the model.", 183 | explain: "The model size is defined by the model configuration, not the class TrainingArguments." 
184 | }, 185 | { 186 | text: "It just contains the hyperparameters used for evaluation.", 187 | explain: "In the example, we specified where the model and its checkpoints will be saved. Try again!" 188 | }, 189 | { 190 | text: "It just contains the hyperparameters used for training.", 191 | explain: "In the example, we used an evaluation_strategy as well, so this impacts evaluation. Try again!" 192 | } 193 | ]} 194 | /> 195 | 196 | ### 9. Why should you use the 🤗 Accelerate library? 197 | 198 | Trainer, not the 🤗 Accelerate library. Try again!" 207 | }, 208 | { 209 | text: "It makes our training loops work on distributed strategies", 210 | explain: "Correct! With 🤗 Accelerate, your training loops will work for multiple GPUs and TPUs.", 211 | correct: true 212 | }, 213 | { 214 | text: "It provides more optimization functions.", 215 | explain: "No, the 🤗 Accelerate library does not provide any optimization functions." 216 | } 217 | ]} 218 | /> 219 | 220 | {:else} 221 | ### 4. What happens when you instantiate one of the `TFAutoModelForXxx` classes with a pretrained language model (such as `bert-base-uncased`) that corresponds to a different task than the one for which it was trained? 222 | 223 | TFAutoModelForSequenceClassification with bert-base-uncased, we got warnings when instantiating the model. The pretrained head is not used for the sequence classification task, so it's discarded and a new head is instantiated with random weights.", 232 | correct: true 233 | }, 234 | { 235 | text: "The head of the pretrained model is discarded.", 236 | explain: "Something else needs to happen. Try again!" 237 | }, 238 | { 239 | text: "Nothing, since the model can still be fine-tuned for the different task.", 240 | explain: "The head of the pretrained model was not trained to solve this task, so we should discard the head!" 241 | } 242 | ]} 243 | /> 244 | 245 | ### 5. The TensorFlow models from `transformers` are already Keras models. What benefit does this offer? 246 | 247 | TPUStrategy scope, including the initialization of the model." 252 | }, 253 | { 254 | text: "You can leverage existing methods such as compile(), fit(), and predict().", 255 | explain: "Correct! Once you have the data, training on it requires very little work.", 256 | correct: true 257 | }, 258 | { 259 | text: "You get to learn Keras as well as transformers.", 260 | explain: "Correct, but we're looking for something else :)", 261 | correct: true 262 | }, 263 | { 264 | text: "You can easily compute metrics related to the dataset.", 265 | explain: "Keras helps us with training and evaluating the model, not computing dataset-related metrics." 266 | } 267 | ]} 268 | /> 269 | 270 | ### 6. How can you define your own custom metric? 271 | 272 | tf.keras.metrics.Metric.", 276 | explain: "Great!", 277 | correct: true 278 | }, 279 | { 280 | text: "Using the Keras functional API.", 281 | explain: "Try again!" 282 | }, 283 | { 284 | text: "By using a callable with signature metric_fn(y_true, y_pred).", 285 | explain: "Correct!", 286 | correct: true 287 | }, 288 | { 289 | text: "By Googling it.", 290 | explain: "That's not the answer we're looking for, but it should help you find it.", 291 | correct: true 292 | } 293 | ]} 294 | /> 295 | 296 | {/if} --------------------------------------------------------------------------------