├── .gitignore ├── README.md ├── ds_zero2_1gpu.json ├── ds_zero3_1gpu.json ├── gpu_run_mlm.sh ├── requirements.txt ├── run_clm.py ├── run_clm_no_trainer.py ├── run_mlm.py ├── run_mlm_no_trainer.py ├── run_plm.py ├── sampled_20190101_20200611_v2.txt └── zero3_gpu_run_mlm.sh /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/linux,python,visualstudiocode,macos 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=linux,python,visualstudiocode,macos 4 | 5 | ### Linux ### 6 | *~ 7 | 8 | # temporary files which can be created if a process still has a handle open of a deleted file 9 | .fuse_hidden* 10 | 11 | # KDE directory preferences 12 | .directory 13 | 14 | # Linux trash folder which might appear on any partition or disk 15 | .Trash-* 16 | 17 | # .nfs files are created when an open file is removed but is still being accessed 18 | .nfs* 19 | 20 | ### macOS ### 21 | # General 22 | .DS_Store 23 | .AppleDouble 24 | .LSOverride 25 | 26 | # Icon must end with two \r 27 | Icon 28 | 29 | 30 | # Thumbnails 31 | ._* 32 | 33 | # Files that might appear in the root of a volume 34 | .DocumentRevisions-V100 35 | .fseventsd 36 | .Spotlight-V100 37 | .TemporaryItems 38 | .Trashes 39 | .VolumeIcon.icns 40 | .com.apple.timemachine.donotpresent 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | 49 | ### Python ### 50 | # Byte-compiled / optimized / DLL files 51 | __pycache__/ 52 | *.py[cod] 53 | *$py.class 54 | 55 | # C extensions 56 | *.so 57 | 58 | # Distribution / packaging 59 | .Python 60 | build/ 61 | develop-eggs/ 62 | dist/ 63 | downloads/ 64 | eggs/ 65 | .eggs/ 66 | parts/ 67 | sdist/ 68 | var/ 69 | wheels/ 70 | pip-wheel-metadata/ 71 | share/python-wheels/ 72 | *.egg-info/ 73 | .installed.cfg 74 | *.egg 75 | MANIFEST 76 | 77 | # PyInstaller 78 | # Usually these files are written by a python script from a template 79 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 80 | *.manifest 81 | *.spec 82 | 83 | # Installer logs 84 | pip-log.txt 85 | pip-delete-this-directory.txt 86 | 87 | # Unit test / coverage reports 88 | htmlcov/ 89 | .tox/ 90 | .nox/ 91 | .coverage 92 | .coverage.* 93 | .cache 94 | nosetests.xml 95 | coverage.xml 96 | *.cover 97 | *.py,cover 98 | .hypothesis/ 99 | .pytest_cache/ 100 | pytestdebug.log 101 | 102 | # Translations 103 | *.mo 104 | *.pot 105 | 106 | # Django stuff: 107 | *.log 108 | local_settings.py 109 | db.sqlite3 110 | db.sqlite3-journal 111 | 112 | # Flask stuff: 113 | instance/ 114 | .webassets-cache 115 | 116 | # Scrapy stuff: 117 | .scrapy 118 | 119 | # Sphinx documentation 120 | docs/_build/ 121 | doc/_build/ 122 | 123 | # PyBuilder 124 | target/ 125 | 126 | # Jupyter Notebook 127 | .ipynb_checkpoints 128 | 129 | # IPython 130 | profile_default/ 131 | ipython_config.py 132 | 133 | # pyenv 134 | .python-version 135 | 136 | # pipenv 137 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 138 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 139 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 140 | # install all needed dependencies. 141 | #Pipfile.lock 142 | 143 | # poetry 144 | #poetry.lock 145 | 146 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 147 | __pypackages__/ 148 | 149 | # Celery stuff 150 | celerybeat-schedule 151 | celerybeat.pid 152 | 153 | # SageMath parsed files 154 | *.sage.py 155 | 156 | # Environments 157 | # .env 158 | .env/ 159 | .venv/ 160 | env/ 161 | venv/ 162 | ENV/ 163 | env.bak/ 164 | venv.bak/ 165 | pythonenv* 166 | 167 | # Spyder project settings 168 | .spyderproject 169 | .spyproject 170 | 171 | # Rope project settings 172 | .ropeproject 173 | 174 | # mkdocs documentation 175 | /site 176 | 177 | # mypy 178 | .mypy_cache/ 179 | .dmypy.json 180 | dmypy.json 181 | 182 | # Pyre type checker 183 | .pyre/ 184 | 185 | # pytype static type analyzer 186 | .pytype/ 187 | 188 | # operating system-related files 189 | # file properties cache/storage on macOS 190 | *.DS_Store 191 | # thumbnail cache on Windows 192 | Thumbs.db 193 | 194 | # profiling data 195 | .prof 196 | 197 | 198 | ### VisualStudioCode ### 199 | .vscode/ 200 | .vscode/* 201 | !.vscode/settings.json 202 | !.vscode/tasks.json 203 | !.vscode/launch.json 204 | !.vscode/extensions.json 205 | *.code-workspace 206 | 207 | ### VisualStudioCode Patch ### 208 | # Ignore all local history of files 209 | .history 210 | .ionide 211 | 212 | # End of https://www.toptal.com/developers/gitignore/api/linux,python,visualstudiocode,macos 213 | # 214 | .env 215 | runs/ 216 | test*/ 217 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | > Fork from https://github.com/huggingface/transformers/tree/86d5fb0b360e68de46d40265e7c707fe68c8015b/examples/pytorch/language-modeling at 2021.05.17. 2 | 3 | 4 | 19 | 20 | ## Language model training 21 | 22 | Fine-tuning (or training from scratch) the library models for language modeling on a text dataset for GPT, GPT-2, 23 | ALBERT, BERT, DistilBERT, RoBERTa, XLNet... GPT and GPT-2 are trained or fine-tuned using a causal language modeling 24 | (CLM) loss while ALBERT, BERT, DistilBERT and RoBERTa are trained or fine-tuned using a masked language modeling (MLM) 25 | loss. XLNet uses permutation language modeling (PLM), you can find more information about the differences between those 26 | objectives in our [model summary](https://huggingface.co/transformers/model_summary.html). 27 | 28 | There are two sets of scripts provided. The first set leverages the Trainer API. The second set with `no_trainer` in the suffix uses a custom training loop and leverages the 🤗 Accelerate library . Both sets use the 🤗 Datasets library. You can easily customize them to your needs if you need extra processing on your datasets. 29 | 30 | **Note:** The old script `run_language_modeling.py` is still available [here](https://github.com/huggingface/transformers/blob/master/examples/legacy/run_language_modeling.py). 31 | 32 | The following examples, will run on datasets hosted on our [hub](https://huggingface.co/datasets) or with your own 33 | text files for training and validation. We give examples of both below. 34 | 35 | ### GPT-2/GPT and causal language modeling 36 | 37 | The following example fine-tunes GPT-2 on WikiText-2. We're using the raw WikiText-2 (no tokens were replaced before 38 | the tokenization). The loss here is that of causal language modeling. 
39 | 40 | ```bash 41 | python run_clm.py \ 42 | --model_name_or_path gpt2 \ 43 | --dataset_name wikitext \ 44 | --dataset_config_name wikitext-2-raw-v1 \ 45 | --do_train \ 46 | --do_eval \ 47 | --output_dir /tmp/test-clm 48 | ``` 49 | 50 | This takes about half an hour to train on a single K80 GPU and about one minute for the evaluation to run. It reaches 51 | a score of ~20 perplexity once fine-tuned on the dataset. 52 | 53 | To run on your own training and validation files, use the following command: 54 | 55 | ```bash 56 | python run_clm.py \ 57 | --model_name_or_path gpt2 \ 58 | --train_file path_to_train_file \ 59 | --validation_file path_to_validation_file \ 60 | --do_train \ 61 | --do_eval \ 62 | --output_dir /tmp/test-clm 63 | ``` 64 | 65 | This uses the built in HuggingFace `Trainer` for training. If you want to use a custom training loop, you can utilize or adapt the `run_clm_no_trainer.py` script. Take a look at the script for a list of supported arguments. An example is shown below: 66 | 67 | ```bash 68 | python run_clm_no_trainer.py \ 69 | --dataset_name wikitext \ 70 | --dataset_config_name wikitext-2-raw-v1 \ 71 | --model_name_or_path gpt2 \ 72 | --output_dir /tmp/test-clm 73 | ``` 74 | 75 | ### RoBERTa/BERT/DistilBERT and masked language modeling 76 | 77 | The following example fine-tunes RoBERTa on WikiText-2. Here too, we're using the raw WikiText-2. The loss is different 78 | as BERT/RoBERTa have a bidirectional mechanism; we're therefore using the same loss that was used during their 79 | pre-training: masked language modeling. 80 | 81 | In accordance to the RoBERTa paper, we use dynamic masking rather than static masking. The model may, therefore, 82 | converge slightly slower (over-fitting takes more epochs). 83 | 84 | ```bash 85 | python run_mlm.py \ 86 | --model_name_or_path roberta-base \ 87 | --dataset_name wikitext \ 88 | --dataset_config_name wikitext-2-raw-v1 \ 89 | --do_train \ 90 | --do_eval \ 91 | --output_dir /tmp/test-mlm 92 | ``` 93 | 94 | To run on your own training and validation files, use the following command: 95 | 96 | ```bash 97 | python run_mlm.py \ 98 | --model_name_or_path roberta-base \ 99 | --train_file path_to_train_file \ 100 | --validation_file path_to_validation_file \ 101 | --do_train \ 102 | --do_eval \ 103 | --output_dir /tmp/test-mlm 104 | ``` 105 | 106 | If your dataset is organized with one sample per line, you can use the `--line_by_line` flag (otherwise the script 107 | concatenates all texts and then splits them in blocks of the same length). 108 | 109 | This uses the built in HuggingFace `Trainer` for training. If you want to use a custom training loop, you can utilize or adapt the `run_mlm_no_trainer.py` script. Take a look at the script for a list of supported arguments. An example is shown below: 110 | 111 | ```bash 112 | python run_mlm_no_trainer.py \ 113 | --dataset_name wikitext \ 114 | --dataset_config_name wikitext-2-raw-v1 \ 115 | --model_name_or_path roberta-base \ 116 | --output_dir /tmp/test-mlm 117 | ``` 118 | 119 | **Note:** On TPU, you should use the flag `--pad_to_max_length` in conjunction with the `--line_by_line` flag to make 120 | sure all your batches have the same length. 121 | 122 | ### Whole word masking 123 | 124 | This part was moved to `examples/research_projects/mlm_wwm`. 125 | 126 | ### XLNet and permutation language modeling 127 | 128 | XLNet uses a different training objective, which is permutation language modeling. 
It is an autoregressive method 129 | to learn bidirectional contexts by maximizing the expected likelihood over all permutations of the input 130 | sequence factorization order. 131 | 132 | We use the `--plm_probability` flag to define the ratio of length of a span of masked tokens to surrounding 133 | context length for permutation language modeling. 134 | 135 | The `--max_span_length` flag may also be used to limit the length of a span of masked tokens used 136 | for permutation language modeling. 137 | 138 | Here is how to fine-tune XLNet on wikitext-2: 139 | 140 | ```bash 141 | python run_plm.py \ 142 | --model_name_or_path=xlnet-base-cased \ 143 | --dataset_name wikitext \ 144 | --dataset_config_name wikitext-2-raw-v1 \ 145 | --do_train \ 146 | --do_eval \ 147 | --output_dir /tmp/test-plm 148 | ``` 149 | 150 | To fine-tune it on your own training and validation file, run: 151 | 152 | ```bash 153 | python run_plm.py \ 154 | --model_name_or_path=xlnet-base-cased \ 155 | --train_file path_to_train_file \ 156 | --validation_file path_to_validation_file \ 157 | --do_train \ 158 | --do_eval \ 159 | --output_dir /tmp/test-plm 160 | ``` 161 | 162 | If your dataset is organized with one sample per line, you can use the `--line_by_line` flag (otherwise the script 163 | concatenates all texts and then splits them in blocks of the same length). 164 | 165 | **Note:** On TPU, you should use the flag `--pad_to_max_length` in conjunction with the `--line_by_line` flag to make 166 | sure all your batches have the same length. 167 | -------------------------------------------------------------------------------- /ds_zero2_1gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "betas": "auto", 15 | "eps": "auto", 16 | "weight_decay": "auto" 17 | } 18 | }, 19 | "scheduler": { 20 | "type": "WarmupLR", 21 | "params": { 22 | "warmup_min_lr": "auto", 23 | "warmup_max_lr": "auto", 24 | "warmup_num_steps": "auto" 25 | } 26 | }, 27 | "zero_optimization": { 28 | "stage": 2, 29 | "allgather_partitions": true, 30 | "allgather_bucket_size": 2e8, 31 | "overlap_comm": true, 32 | "reduce_scatter": true, 33 | "reduce_bucket_size": 2e8, 34 | "contiguous_gradients": true, 35 | "cpu_offload": true 36 | }, 37 | "gradient_accumulation_steps": "auto", 38 | "gradient_clipping": "auto", 39 | "train_batch_size": "auto", 40 | "train_micro_batch_size_per_gpu": "auto" 41 | } -------------------------------------------------------------------------------- /ds_zero3_1gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "betas": "auto", 15 | "eps": "auto", 16 | "weight_decay": "auto" 17 | } 18 | }, 19 | "scheduler": { 20 | "type": "WarmupLR", 21 | "params": { 22 | "warmup_min_lr": "auto", 23 | "warmup_max_lr": "auto", 24 | "warmup_num_steps": "auto" 25 | } 26 | }, 27 | "zero_optimization": { 28 | "stage": 3, 29 | "offload_optimizer": { 30 | "device": "cpu", 31 | "pin_memory": true 32 | }, 33 | "offload_param": { 34 | "device": "cpu", 35 | 
"pin_memory": true 36 | }, 37 | "overlap_comm": true, 38 | "contiguous_gradients": true, 39 | "sub_group_size": 1e14, 40 | "reduce_bucket_size": "auto", 41 | "stage3_prefetch_bucket_size": "auto", 42 | "stage3_param_persistence_threshold": "auto", 43 | "stage3_max_live_parameters": 1e9, 44 | "stage3_max_reuse_distance": 1e9, 45 | "stage3_gather_fp16_weights_on_model_save": true 46 | }, 47 | "gradient_accumulation_steps": "auto", 48 | "gradient_clipping": "auto", 49 | "steps_per_print": 2000, 50 | "train_batch_size": "auto", 51 | "train_micro_batch_size_per_gpu": "auto", 52 | "wall_clock_breakdown": false 53 | } -------------------------------------------------------------------------------- /gpu_run_mlm.sh: -------------------------------------------------------------------------------- 1 | rm -rf ./test-bert-zero2-multigpu 2 | 3 | export BS=32 4 | export NCCL_DEBUG=INFO 5 | export NCCL_SHM_DISABLE=1 6 | 7 | deepspeed run_mlm.py \ 8 | --seed 42 \ 9 | --model_type bert \ 10 | --tokenizer_name beomi/KcELECTRA-base \ 11 | --train_file ./sampled_20190101_20200611_v2.txt \ 12 | --num_train_epochs 2 \ 13 | --per_device_train_batch_size $BS \ 14 | --per_device_eval_batch_size $BS \ 15 | --do_train \ 16 | --output_dir ./test-bert-zero2-multigpu \ 17 | --fp16 \ 18 | --logging_first_step \ 19 | --max_seq_length 300 \ 20 | --deepspeed ./ds_zero2_1gpu.json \ 21 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | sentencepiece != 0.1.92 3 | protobuf 4 | -------------------------------------------------------------------------------- /run_clm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. 18 | 19 | Here is the full list of checkpoints on the hub that can be fine-tuned by this script: 20 | https://huggingface.co/models?filter=causal-lm 21 | """ 22 | # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. 
23 | 24 | import logging 25 | import math 26 | import os 27 | import sys 28 | from dataclasses import dataclass, field 29 | from typing import Optional 30 | 31 | from datasets import load_dataset 32 | 33 | import transformers 34 | from transformers import ( 35 | CONFIG_MAPPING, 36 | MODEL_FOR_CAUSAL_LM_MAPPING, 37 | AutoConfig, 38 | AutoModelForCausalLM, 39 | AutoTokenizer, 40 | HfArgumentParser, 41 | Trainer, 42 | TrainingArguments, 43 | default_data_collator, 44 | set_seed, 45 | ) 46 | from transformers.testing_utils import CaptureLogger 47 | from transformers.trainer_utils import get_last_checkpoint, is_main_process 48 | from transformers.utils import check_min_version 49 | 50 | 51 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 52 | check_min_version("4.7.0.dev0") 53 | 54 | logger = logging.getLogger(__name__) 55 | 56 | 57 | MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) 58 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 59 | 60 | 61 | @dataclass 62 | class ModelArguments: 63 | """ 64 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 65 | """ 66 | 67 | model_name_or_path: Optional[str] = field( 68 | default=None, 69 | metadata={ 70 | "help": "The model checkpoint for weights initialization." 71 | "Don't set if you want to train a model from scratch." 72 | }, 73 | ) 74 | model_type: Optional[str] = field( 75 | default=None, 76 | metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, 77 | ) 78 | config_name: Optional[str] = field( 79 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 80 | ) 81 | tokenizer_name: Optional[str] = field( 82 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 83 | ) 84 | cache_dir: Optional[str] = field( 85 | default=None, 86 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 87 | ) 88 | use_fast_tokenizer: bool = field( 89 | default=True, 90 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 91 | ) 92 | model_revision: str = field( 93 | default="main", 94 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 95 | ) 96 | use_auth_token: bool = field( 97 | default=False, 98 | metadata={ 99 | "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " 100 | "with private models)." 101 | }, 102 | ) 103 | 104 | 105 | @dataclass 106 | class DataTrainingArguments: 107 | """ 108 | Arguments pertaining to what data we are going to input our model for training and eval. 
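The main CLM-specific option is `block_size`, which sets the chunk length the concatenated
corpus is split into; it defaults to the model's maximum input length (capped at 1024 in this script).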
109 | """ 110 | 111 | dataset_name: Optional[str] = field( 112 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 113 | ) 114 | dataset_config_name: Optional[str] = field( 115 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 116 | ) 117 | train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) 118 | validation_file: Optional[str] = field( 119 | default=None, 120 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, 121 | ) 122 | max_train_samples: Optional[int] = field( 123 | default=None, 124 | metadata={ 125 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this " 126 | "value if set." 127 | }, 128 | ) 129 | max_eval_samples: Optional[int] = field( 130 | default=None, 131 | metadata={ 132 | "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 133 | "value if set." 134 | }, 135 | ) 136 | 137 | block_size: Optional[int] = field( 138 | default=None, 139 | metadata={ 140 | "help": "Optional input sequence length after tokenization. " 141 | "The training dataset will be truncated in block of this size for training. " 142 | "Default to the model max input length for single sentence inputs (take into account special tokens)." 143 | }, 144 | ) 145 | overwrite_cache: bool = field( 146 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 147 | ) 148 | validation_split_percentage: Optional[int] = field( 149 | default=5, 150 | metadata={ 151 | "help": "The percentage of the train set used as validation set in case there's no validation split" 152 | }, 153 | ) 154 | preprocessing_num_workers: Optional[int] = field( 155 | default=None, 156 | metadata={"help": "The number of processes to use for the preprocessing."}, 157 | ) 158 | 159 | def __post_init__(self): 160 | if self.dataset_name is None and self.train_file is None and self.validation_file is None: 161 | raise ValueError("Need either a dataset name or a training/validation file.") 162 | else: 163 | if self.train_file is not None: 164 | extension = self.train_file.split(".")[-1] 165 | assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." 166 | if self.validation_file is not None: 167 | extension = self.validation_file.split(".")[-1] 168 | assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." 169 | 170 | 171 | def main(): 172 | # See all possible arguments in src/transformers/training_args.py 173 | # or by passing the --help flag to this script. 174 | # We now keep distinct sets of args, for a cleaner separation of concerns. 175 | 176 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 177 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 178 | # If we pass only one argument to the script and it's the path to a json file, 179 | # let's parse it to get our arguments. 180 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 181 | else: 182 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 183 | 184 | # Detecting last checkpoint. 
185 | last_checkpoint = None 186 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 187 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 188 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 189 | raise ValueError( 190 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 191 | "Use --overwrite_output_dir to overcome." 192 | ) 193 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 194 | logger.info( 195 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 196 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 197 | ) 198 | 199 | # Setup logging 200 | logging.basicConfig( 201 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 202 | datefmt="%m/%d/%Y %H:%M:%S", 203 | handlers=[logging.StreamHandler(sys.stdout)], 204 | ) 205 | logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) 206 | 207 | # Log on each process the small summary: 208 | logger.warning( 209 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 210 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 211 | ) 212 | # Set the verbosity to info of the Transformers logger (on main process only): 213 | if is_main_process(training_args.local_rank): 214 | transformers.utils.logging.set_verbosity_info() 215 | transformers.utils.logging.enable_default_handler() 216 | transformers.utils.logging.enable_explicit_format() 217 | logger.info(f"Training/evaluation parameters {training_args}") 218 | 219 | # Set seed before initializing model. 220 | set_seed(training_args.seed) 221 | 222 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 223 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 224 | # (the dataset will be downloaded automatically from the datasets Hub). 225 | # 226 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called 227 | # 'text' is found. You can easily tweak this behavior (see below). 228 | # 229 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 230 | # download the dataset. 231 | if data_args.dataset_name is not None: 232 | # Downloading and loading a dataset from the hub. 
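# If the dataset has no "validation" split, the first `validation_split_percentage` percent of
# the train split is used for evaluation and the remainder for training.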
233 | datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) 234 | if "validation" not in datasets.keys(): 235 | datasets["validation"] = load_dataset( 236 | data_args.dataset_name, 237 | data_args.dataset_config_name, 238 | split=f"train[:{data_args.validation_split_percentage}%]", 239 | cache_dir=model_args.cache_dir, 240 | ) 241 | datasets["train"] = load_dataset( 242 | data_args.dataset_name, 243 | data_args.dataset_config_name, 244 | split=f"train[{data_args.validation_split_percentage}%:]", 245 | cache_dir=model_args.cache_dir, 246 | ) 247 | else: 248 | data_files = {} 249 | if data_args.train_file is not None: 250 | data_files["train"] = data_args.train_file 251 | if data_args.validation_file is not None: 252 | data_files["validation"] = data_args.validation_file 253 | extension = ( 254 | data_args.train_file.split(".")[-1] 255 | if data_args.train_file is not None 256 | else data_args.validation_file.split(".")[-1] 257 | ) 258 | if extension == "txt": 259 | extension = "text" 260 | datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) 261 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 262 | # https://huggingface.co/docs/datasets/loading_datasets.html. 263 | 264 | # Load pretrained model and tokenizer 265 | # 266 | # Distributed training: 267 | # The .from_pretrained methods guarantee that only one local process can concurrently 268 | # download model & vocab. 269 | 270 | config_kwargs = { 271 | "cache_dir": model_args.cache_dir, 272 | "revision": model_args.model_revision, 273 | "use_auth_token": True if model_args.use_auth_token else None, 274 | } 275 | if model_args.config_name: 276 | config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) 277 | elif model_args.model_name_or_path: 278 | config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) 279 | else: 280 | config = CONFIG_MAPPING[model_args.model_type]() 281 | logger.warning("You are instantiating a new config instance from scratch.") 282 | 283 | tokenizer_kwargs = { 284 | "cache_dir": model_args.cache_dir, 285 | "use_fast": model_args.use_fast_tokenizer, 286 | "revision": model_args.model_revision, 287 | "use_auth_token": True if model_args.use_auth_token else None, 288 | } 289 | if model_args.tokenizer_name: 290 | tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) 291 | elif model_args.model_name_or_path: 292 | tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) 293 | else: 294 | raise ValueError( 295 | "You are instantiating a new tokenizer from scratch. This is not supported by this script." 296 | "You can do it from another script, save it, and load it from here, using --tokenizer_name." 297 | ) 298 | 299 | if model_args.model_name_or_path: 300 | model = AutoModelForCausalLM.from_pretrained( 301 | model_args.model_name_or_path, 302 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 303 | config=config, 304 | cache_dir=model_args.cache_dir, 305 | revision=model_args.model_revision, 306 | use_auth_token=True if model_args.use_auth_token else None, 307 | ) 308 | else: 309 | logger.info("Training new model from scratch") 310 | model = AutoModelForCausalLM.from_config(config) 311 | 312 | model.resize_token_embeddings(len(tokenizer)) 313 | 314 | # Preprocessing the datasets. 315 | # First we tokenize all the texts. 
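# The column named "text" is tokenized when it exists; otherwise the first column of the dataset is used.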
316 | if training_args.do_train: 317 | column_names = datasets["train"].column_names 318 | else: 319 | column_names = datasets["validation"].column_names 320 | text_column_name = "text" if "text" in column_names else column_names[0] 321 | 322 | # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function 323 | tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") 324 | 325 | def tokenize_function(examples): 326 | with CaptureLogger(tok_logger) as cl: 327 | output = tokenizer(examples[text_column_name]) 328 | # clm input could be much much longer than block_size 329 | if "Token indices sequence length is longer than the" in cl.out: 330 | tok_logger.warning( 331 | "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model." 332 | ) 333 | return output 334 | 335 | tokenized_datasets = datasets.map( 336 | tokenize_function, 337 | batched=True, 338 | num_proc=data_args.preprocessing_num_workers, 339 | remove_columns=column_names, 340 | load_from_cache_file=not data_args.overwrite_cache, 341 | ) 342 | 343 | if data_args.block_size is None: 344 | block_size = tokenizer.model_max_length 345 | if block_size > 1024: 346 | logger.warning( 347 | f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " 348 | "Picking 1024 instead. You can change that default value by passing --block_size xxx." 349 | ) 350 | block_size = 1024 351 | else: 352 | if data_args.block_size > tokenizer.model_max_length: 353 | logger.warning( 354 | f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" 355 | f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." 356 | ) 357 | block_size = min(data_args.block_size, tokenizer.model_max_length) 358 | 359 | # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. 360 | def group_texts(examples): 361 | # Concatenate all texts. 362 | concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} 363 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 364 | # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can 365 | # customize this part to your needs. 366 | total_length = (total_length // block_size) * block_size 367 | # Split by chunks of max_len. 368 | result = { 369 | k: [t[i : i + block_size] for i in range(0, total_length, block_size)] 370 | for k, t in concatenated_examples.items() 371 | } 372 | result["labels"] = result["input_ids"].copy() 373 | return result 374 | 375 | # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder 376 | # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower 377 | # to preprocess. 378 | # 379 | # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: 380 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map 381 | 382 | lm_datasets = tokenized_datasets.map( 383 | group_texts, 384 | batched=True, 385 | num_proc=data_args.preprocessing_num_workers, 386 | load_from_cache_file=not data_args.overwrite_cache, 387 | ) 388 | 389 | if training_args.do_train: 390 | if "train" not in tokenized_datasets: 391 | raise ValueError("--do_train requires a train dataset") 392 | train_dataset = lm_datasets["train"] 393 | if data_args.max_train_samples is not None: 394 | train_dataset = train_dataset.select(range(data_args.max_train_samples)) 395 | 396 | if training_args.do_eval: 397 | if "validation" not in tokenized_datasets: 398 | raise ValueError("--do_eval requires a validation dataset") 399 | eval_dataset = lm_datasets["validation"] 400 | if data_args.max_eval_samples is not None: 401 | eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) 402 | 403 | # Initialize our Trainer 404 | trainer = Trainer( 405 | model=model, 406 | args=training_args, 407 | train_dataset=train_dataset if training_args.do_train else None, 408 | eval_dataset=eval_dataset if training_args.do_eval else None, 409 | tokenizer=tokenizer, 410 | # Data collator will default to DataCollatorWithPadding, so we change it. 411 | data_collator=default_data_collator, 412 | ) 413 | 414 | # Training 415 | if training_args.do_train: 416 | checkpoint = None 417 | if training_args.resume_from_checkpoint is not None: 418 | checkpoint = training_args.resume_from_checkpoint 419 | elif last_checkpoint is not None: 420 | checkpoint = last_checkpoint 421 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 422 | trainer.save_model() # Saves the tokenizer too for easy upload 423 | 424 | metrics = train_result.metrics 425 | 426 | max_train_samples = ( 427 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 428 | ) 429 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 430 | 431 | trainer.log_metrics("train", metrics) 432 | trainer.save_metrics("train", metrics) 433 | trainer.save_state() 434 | 435 | # Evaluation 436 | if training_args.do_eval: 437 | logger.info("*** Evaluate ***") 438 | 439 | metrics = trainer.evaluate() 440 | 441 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 442 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 443 | perplexity = math.exp(metrics["eval_loss"]) 444 | metrics["perplexity"] = perplexity 445 | 446 | trainer.log_metrics("eval", metrics) 447 | trainer.save_metrics("eval", metrics) 448 | 449 | if training_args.push_to_hub: 450 | kwargs = {"finetuned_from": model_args.model_name_or_path, "tags": "text-generation"} 451 | if data_args.dataset_name is not None: 452 | kwargs["dataset_tags"] = data_args.dataset_name 453 | if data_args.dataset_config_name is not None: 454 | kwargs["dataset_args"] = data_args.dataset_config_name 455 | kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" 456 | else: 457 | kwargs["dataset"] = data_args.dataset_name 458 | 459 | trainer.push_to_hub(**kwargs) 460 | 461 | 462 | def _mp_fn(index): 463 | # For xla_spawn (TPUs) 464 | main() 465 | 466 | 467 | if __name__ == "__main__": 468 | main() 469 | -------------------------------------------------------------------------------- /run_clm_no_trainer.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2021 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) 18 | on a text file or a dataset without using HuggingFace Trainer. 19 | 20 | Here is the full list of checkpoints on the hub that can be fine-tuned by this script: 21 | https://huggingface.co/models?filter=causal-lm 22 | """ 23 | # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. 24 | 25 | import argparse 26 | import logging 27 | import math 28 | import os 29 | import random 30 | 31 | import datasets 32 | import torch 33 | from datasets import load_dataset 34 | from torch.utils.data.dataloader import DataLoader 35 | from tqdm.auto import tqdm 36 | 37 | import transformers 38 | from accelerate import Accelerator 39 | from transformers import ( 40 | CONFIG_MAPPING, 41 | MODEL_MAPPING, 42 | AdamW, 43 | AutoConfig, 44 | AutoModelForCausalLM, 45 | AutoTokenizer, 46 | SchedulerType, 47 | default_data_collator, 48 | get_scheduler, 49 | set_seed, 50 | ) 51 | 52 | 53 | logger = logging.getLogger(__name__) 54 | MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) 55 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 56 | 57 | 58 | def parse_args(): 59 | parser = argparse.ArgumentParser(description="Finetune a transformers model on a causal language modeling task") 60 | parser.add_argument( 61 | "--dataset_name", 62 | type=str, 63 | default=None, 64 | help="The name of the dataset to use (via the datasets library).", 65 | ) 66 | parser.add_argument( 67 | "--dataset_config_name", 68 | type=str, 69 | default=None, 70 | help="The configuration name of the dataset to use (via the datasets library).", 71 | ) 72 | parser.add_argument( 73 | "--train_file", type=str, default=None, help="A csv or a json file containing the training data." 74 | ) 75 | parser.add_argument( 76 | "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data." 
77 | ) 78 | parser.add_argument( 79 | "--validation_split_percentage", 80 | default=5, 81 | help="The percentage of the train set used as validation set in case there's no validation split", 82 | ) 83 | parser.add_argument( 84 | "--model_name_or_path", 85 | type=str, 86 | help="Path to pretrained model or model identifier from huggingface.co/models.", 87 | required=True, 88 | ) 89 | parser.add_argument( 90 | "--config_name", 91 | type=str, 92 | default=None, 93 | help="Pretrained config name or path if not the same as model_name", 94 | ) 95 | parser.add_argument( 96 | "--tokenizer_name", 97 | type=str, 98 | default=None, 99 | help="Pretrained tokenizer name or path if not the same as model_name", 100 | ) 101 | parser.add_argument( 102 | "--use_slow_tokenizer", 103 | action="store_true", 104 | help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).", 105 | ) 106 | parser.add_argument( 107 | "--per_device_train_batch_size", 108 | type=int, 109 | default=8, 110 | help="Batch size (per device) for the training dataloader.", 111 | ) 112 | parser.add_argument( 113 | "--per_device_eval_batch_size", 114 | type=int, 115 | default=8, 116 | help="Batch size (per device) for the evaluation dataloader.", 117 | ) 118 | parser.add_argument( 119 | "--learning_rate", 120 | type=float, 121 | default=5e-5, 122 | help="Initial learning rate (after the potential warmup period) to use.", 123 | ) 124 | parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.") 125 | parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.") 126 | parser.add_argument( 127 | "--max_train_steps", 128 | type=int, 129 | default=None, 130 | help="Total number of training steps to perform. If provided, overrides num_train_epochs.", 131 | ) 132 | parser.add_argument( 133 | "--gradient_accumulation_steps", 134 | type=int, 135 | default=1, 136 | help="Number of updates steps to accumulate before performing a backward/update pass.", 137 | ) 138 | parser.add_argument( 139 | "--lr_scheduler_type", 140 | type=SchedulerType, 141 | default="linear", 142 | help="The scheduler type to use.", 143 | choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"], 144 | ) 145 | parser.add_argument( 146 | "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler." 147 | ) 148 | parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.") 149 | parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") 150 | parser.add_argument( 151 | "--model_type", 152 | type=str, 153 | default=None, 154 | help="Model type to use if training from scratch.", 155 | choices=MODEL_TYPES, 156 | ) 157 | parser.add_argument( 158 | "--block_size", 159 | type=int, 160 | default=None, 161 | help="Optional input sequence length after tokenization. The training dataset will be truncated in block of this size for training. 
Default to the model max input length for single sentence inputs (take into account special tokens).", 162 | ) 163 | parser.add_argument( 164 | "--preprocessing_num_workers", 165 | type=int, 166 | default=None, 167 | help="The number of processes to use for the preprocessing.", 168 | ) 169 | parser.add_argument( 170 | "--overwrite_cache", type=bool, default=False, help="Overwrite the cached training and evaluation sets" 171 | ) 172 | 173 | args = parser.parse_args() 174 | 175 | # Sanity checks 176 | if args.dataset_name is None and args.train_file is None and args.validation_file is None: 177 | raise ValueError("Need either a dataset name or a training/validation file.") 178 | else: 179 | if args.train_file is not None: 180 | extension = args.train_file.split(".")[-1] 181 | assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file." 182 | if args.validation_file is not None: 183 | extension = args.validation_file.split(".")[-1] 184 | assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file." 185 | 186 | if args.output_dir is not None: 187 | os.makedirs(args.output_dir, exist_ok=True) 188 | 189 | return args 190 | 191 | 192 | def main(): 193 | args = parse_args() 194 | 195 | # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. 196 | accelerator = Accelerator() 197 | # Make one log on every process with the configuration for debugging. 198 | logging.basicConfig( 199 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 200 | datefmt="%m/%d/%Y %H:%M:%S", 201 | level=logging.INFO, 202 | ) 203 | logger.info(accelerator.state) 204 | 205 | # Setup logging, we only want one process per machine to log things on the screen. 206 | # accelerator.is_local_main_process is only True for one process per machine. 207 | logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR) 208 | if accelerator.is_local_main_process: 209 | datasets.utils.logging.set_verbosity_warning() 210 | transformers.utils.logging.set_verbosity_info() 211 | else: 212 | datasets.utils.logging.set_verbosity_error() 213 | transformers.utils.logging.set_verbosity_error() 214 | 215 | # If passed along, set the training seed now. 216 | if args.seed is not None: 217 | set_seed(args.seed) 218 | 219 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 220 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 221 | # (the dataset will be downloaded automatically from the datasets Hub). 222 | # 223 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called 224 | # 'text' is found. You can easily tweak this behavior (see below). 225 | # 226 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 227 | # download the dataset. 228 | if args.dataset_name is not None: 229 | # Downloading and loading a dataset from the hub. 
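# As in run_clm.py, a missing "validation" split is carved out of the first
# `validation_split_percentage` percent of the train split.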
230 | raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) 231 | if "validation" not in raw_datasets.keys(): 232 | raw_datasets["validation"] = load_dataset( 233 | args.dataset_name, 234 | args.dataset_config_name, 235 | split=f"train[:{args.validation_split_percentage}%]", 236 | ) 237 | raw_datasets["train"] = load_dataset( 238 | args.dataset_name, 239 | args.dataset_config_name, 240 | split=f"train[{args.validation_split_percentage}%:]", 241 | ) 242 | else: 243 | data_files = {} 244 | if args.train_file is not None: 245 | data_files["train"] = args.train_file 246 | if args.validation_file is not None: 247 | data_files["validation"] = args.validation_file 248 | extension = args.train_file.split(".")[-1] 249 | if extension == "txt": 250 | extension = "text" 251 | raw_datasets = load_dataset(extension, data_files=data_files) 252 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 253 | # https://huggingface.co/docs/datasets/loading_datasets.html. 254 | 255 | # Load pretrained model and tokenizer 256 | # 257 | # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently 258 | # download model & vocab. 259 | if args.config_name: 260 | config = AutoConfig.from_pretrained(args.config_name) 261 | elif args.model_name_or_path: 262 | config = AutoConfig.from_pretrained(args.model_name_or_path) 263 | else: 264 | config = CONFIG_MAPPING[args.model_type]() 265 | logger.warning("You are instantiating a new config instance from scratch.") 266 | 267 | if args.tokenizer_name: 268 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer) 269 | elif args.model_name_or_path: 270 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer) 271 | else: 272 | raise ValueError( 273 | "You are instantiating a new tokenizer from scratch. This is not supported by this script." 274 | "You can do it from another script, save it, and load it from here, using --tokenizer_name." 275 | ) 276 | 277 | if args.model_name_or_path: 278 | model = AutoModelForCausalLM.from_pretrained( 279 | args.model_name_or_path, 280 | from_tf=bool(".ckpt" in args.model_name_or_path), 281 | config=config, 282 | ) 283 | else: 284 | logger.info("Training new model from scratch") 285 | model = AutoModelForCausalLM.from_config(config) 286 | 287 | model.resize_token_embeddings(len(tokenizer)) 288 | 289 | # Preprocessing the datasets. 290 | # First we tokenize all the texts. 291 | column_names = raw_datasets["train"].column_names 292 | text_column_name = "text" if "text" in column_names else column_names[0] 293 | 294 | def tokenize_function(examples): 295 | return tokenizer(examples[text_column_name]) 296 | 297 | tokenized_datasets = raw_datasets.map( 298 | tokenize_function, 299 | batched=True, 300 | num_proc=args.preprocessing_num_workers, 301 | remove_columns=column_names, 302 | load_from_cache_file=not args.overwrite_cache, 303 | ) 304 | 305 | if args.block_size is None: 306 | block_size = tokenizer.model_max_length 307 | if block_size > 1024: 308 | logger.warning( 309 | f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " 310 | "Picking 1024 instead. You can change that default value by passing --block_size xxx." 
311 | ) 312 | block_size = 1024 313 | else: 314 | if args.block_size > tokenizer.model_max_length: 315 | logger.warning( 316 | f"The block_size passed ({args.block_size}) is larger than the maximum length for the model" 317 | f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." 318 | ) 319 | block_size = min(args.block_size, tokenizer.model_max_length) 320 | 321 | # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. 322 | def group_texts(examples): 323 | # Concatenate all texts. 324 | concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} 325 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 326 | # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can 327 | # customize this part to your needs. 328 | total_length = (total_length // block_size) * block_size 329 | # Split by chunks of max_len. 330 | result = { 331 | k: [t[i : i + block_size] for i in range(0, total_length, block_size)] 332 | for k, t in concatenated_examples.items() 333 | } 334 | result["labels"] = result["input_ids"].copy() 335 | return result 336 | 337 | # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder 338 | # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower 339 | # to preprocess. 340 | # 341 | # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: 342 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map 343 | 344 | lm_datasets = tokenized_datasets.map( 345 | group_texts, 346 | batched=True, 347 | num_proc=args.preprocessing_num_workers, 348 | load_from_cache_file=not args.overwrite_cache, 349 | ) 350 | 351 | train_dataset = lm_datasets["train"] 352 | eval_dataset = lm_datasets["validation"] 353 | 354 | # Log a few random samples from the training set: 355 | for index in random.sample(range(len(train_dataset)), 3): 356 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") 357 | 358 | # DataLoaders creation: 359 | train_dataloader = DataLoader( 360 | train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=args.per_device_train_batch_size 361 | ) 362 | eval_dataloader = DataLoader( 363 | eval_dataset, collate_fn=default_data_collator, batch_size=args.per_device_eval_batch_size 364 | ) 365 | 366 | # Optimizer 367 | # Split weights in two groups, one with weight decay and the other not. 368 | no_decay = ["bias", "LayerNorm.weight"] 369 | optimizer_grouped_parameters = [ 370 | { 371 | "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 372 | "weight_decay": args.weight_decay, 373 | }, 374 | { 375 | "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 376 | "weight_decay": 0.0, 377 | }, 378 | ] 379 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) 380 | 381 | # Prepare everything with our `accelerator`. 382 | model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( 383 | model, optimizer, train_dataloader, eval_dataloader 384 | ) 385 | 386 | # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be 387 | # shorter in multiprocess) 388 | 389 | # Scheduler and math around the number of training steps. 
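# One optimizer update happens every `gradient_accumulation_steps` batches, so
# updates per epoch = ceil(len(train_dataloader) / gradient_accumulation_steps);
# `--max_train_steps`, when given, takes precedence and the epoch count is derived from it.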
390 | num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) 391 | if args.max_train_steps is None: 392 | args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch 393 | else: 394 | args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) 395 | 396 | lr_scheduler = get_scheduler( 397 | name=args.lr_scheduler_type, 398 | optimizer=optimizer, 399 | num_warmup_steps=args.num_warmup_steps, 400 | num_training_steps=args.max_train_steps, 401 | ) 402 | 403 | # Train! 404 | total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps 405 | 406 | logger.info("***** Running training *****") 407 | logger.info(f" Num examples = {len(train_dataset)}") 408 | logger.info(f" Num Epochs = {args.num_train_epochs}") 409 | logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") 410 | logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") 411 | logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") 412 | logger.info(f" Total optimization steps = {args.max_train_steps}") 413 | # Only show the progress bar once on each machine. 414 | progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) 415 | completed_steps = 0 416 | 417 | for epoch in range(args.num_train_epochs): 418 | model.train() 419 | for step, batch in enumerate(train_dataloader): 420 | outputs = model(**batch) 421 | loss = outputs.loss 422 | loss = loss / args.gradient_accumulation_steps 423 | accelerator.backward(loss) 424 | if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: 425 | optimizer.step() 426 | lr_scheduler.step() 427 | optimizer.zero_grad() 428 | progress_bar.update(1) 429 | completed_steps += 1 430 | 431 | if completed_steps >= args.max_train_steps: 432 | break 433 | 434 | model.eval() 435 | losses = [] 436 | for step, batch in enumerate(eval_dataloader): 437 | with torch.no_grad(): 438 | outputs = model(**batch) 439 | 440 | loss = outputs.loss 441 | losses.append(accelerator.gather(loss.repeat(args.per_device_eval_batch_size))) 442 | 443 | losses = torch.cat(losses) 444 | losses = losses[: len(eval_dataset)] 445 | perplexity = math.exp(torch.mean(losses)) 446 | 447 | logger.info(f"epoch {epoch}: perplexity: {perplexity}") 448 | 449 | if args.output_dir is not None: 450 | accelerator.wait_for_everyone() 451 | unwrapped_model = accelerator.unwrap_model(model) 452 | unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save) 453 | 454 | 455 | if __name__ == "__main__": 456 | main() 457 | -------------------------------------------------------------------------------- /run_mlm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Team All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) on a text file or a dataset. 18 | 19 | Here is the full list of checkpoints on the hub that can be fine-tuned by this script: 20 | https://huggingface.co/models?filter=masked-lm 21 | """ 22 | # You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments. 23 | 24 | import logging 25 | import math 26 | import os 27 | import sys 28 | from dataclasses import dataclass, field 29 | from typing import Optional 30 | 31 | from datasets import load_dataset 32 | 33 | import transformers 34 | from transformers import ( 35 | CONFIG_MAPPING, 36 | MODEL_FOR_MASKED_LM_MAPPING, 37 | AutoConfig, 38 | AutoModelForMaskedLM, 39 | AutoTokenizer, 40 | DataCollatorForLanguageModeling, 41 | HfArgumentParser, 42 | Trainer, 43 | TrainingArguments, 44 | set_seed, 45 | ) 46 | from transformers.trainer_utils import get_last_checkpoint, is_main_process 47 | from transformers.utils import check_min_version 48 | 49 | 50 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 51 | check_min_version("4.7.0.dev0") 52 | 53 | logger = logging.getLogger(__name__) 54 | MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys()) 55 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 56 | 57 | 58 | @dataclass 59 | class ModelArguments: 60 | """ 61 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 62 | """ 63 | 64 | model_name_or_path: Optional[str] = field( 65 | default=None, 66 | metadata={ 67 | "help": "The model checkpoint for weights initialization." 68 | "Don't set if you want to train a model from scratch." 69 | }, 70 | ) 71 | model_type: Optional[str] = field( 72 | default=None, 73 | metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, 74 | ) 75 | config_name: Optional[str] = field( 76 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 77 | ) 78 | tokenizer_name: Optional[str] = field( 79 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 80 | ) 81 | cache_dir: Optional[str] = field( 82 | default=None, 83 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 84 | ) 85 | use_fast_tokenizer: bool = field( 86 | default=True, 87 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 88 | ) 89 | model_revision: str = field( 90 | default="main", 91 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 92 | ) 93 | use_auth_token: bool = field( 94 | default=False, 95 | metadata={ 96 | "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " 97 | "with private models)." 98 | }, 99 | ) 100 | 101 | 102 | @dataclass 103 | class DataTrainingArguments: 104 | """ 105 | Arguments pertaining to what data we are going to input our model for training and eval. 
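MLM-specific options include `mlm_probability` (the masking ratio), `line_by_line` (treat each
line as a separate sequence), and `max_seq_length`/`pad_to_max_length` (truncation and padding behaviour).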
106 | """ 107 | 108 | dataset_name: Optional[str] = field( 109 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 110 | ) 111 | dataset_config_name: Optional[str] = field( 112 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 113 | ) 114 | train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) 115 | validation_file: Optional[str] = field( 116 | default=None, 117 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, 118 | ) 119 | overwrite_cache: bool = field( 120 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 121 | ) 122 | validation_split_percentage: Optional[int] = field( 123 | default=5, 124 | metadata={ 125 | "help": "The percentage of the train set used as validation set in case there's no validation split" 126 | }, 127 | ) 128 | max_seq_length: Optional[int] = field( 129 | default=None, 130 | metadata={ 131 | "help": "The maximum total input sequence length after tokenization. Sequences longer " 132 | "than this will be truncated." 133 | }, 134 | ) 135 | preprocessing_num_workers: Optional[int] = field( 136 | default=None, 137 | metadata={"help": "The number of processes to use for the preprocessing."}, 138 | ) 139 | mlm_probability: float = field( 140 | default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"} 141 | ) 142 | line_by_line: bool = field( 143 | default=False, 144 | metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."}, 145 | ) 146 | pad_to_max_length: bool = field( 147 | default=False, 148 | metadata={ 149 | "help": "Whether to pad all samples to `max_seq_length`. " 150 | "If False, will pad the samples dynamically when batching to the maximum length in the batch." 151 | }, 152 | ) 153 | max_train_samples: Optional[int] = field( 154 | default=None, 155 | metadata={ 156 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this " 157 | "value if set." 158 | }, 159 | ) 160 | max_eval_samples: Optional[int] = field( 161 | default=None, 162 | metadata={ 163 | "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 164 | "value if set." 165 | }, 166 | ) 167 | 168 | def __post_init__(self): 169 | if self.dataset_name is None and self.train_file is None and self.validation_file is None: 170 | raise ValueError("Need either a dataset name or a training/validation file.") 171 | else: 172 | if self.train_file is not None: 173 | extension = self.train_file.split(".")[-1] 174 | assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." 175 | if self.validation_file is not None: 176 | extension = self.validation_file.split(".")[-1] 177 | assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." 178 | 179 | 180 | def main(): 181 | # See all possible arguments in src/transformers/training_args.py 182 | # or by passing the --help flag to this script. 183 | # We now keep distinct sets of args, for a cleaner separation of concerns. 
184 | 185 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 186 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 187 | # If we pass only one argument to the script and it's the path to a json file, 188 | # let's parse it to get our arguments. 189 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 190 | else: 191 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 192 | 193 | # Detecting last checkpoint. 194 | last_checkpoint = None 195 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 196 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 197 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 198 | raise ValueError( 199 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 200 | "Use --overwrite_output_dir to overcome." 201 | ) 202 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 203 | logger.info( 204 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 205 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 206 | ) 207 | 208 | # Setup logging 209 | logging.basicConfig( 210 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 211 | datefmt="%m/%d/%Y %H:%M:%S", 212 | handlers=[logging.StreamHandler(sys.stdout)], 213 | ) 214 | logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) 215 | 216 | # Log on each process the small summary: 217 | logger.warning( 218 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 219 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 220 | ) 221 | # Set the verbosity to info of the Transformers logger (on main process only): 222 | if is_main_process(training_args.local_rank): 223 | transformers.utils.logging.set_verbosity_info() 224 | transformers.utils.logging.enable_default_handler() 225 | transformers.utils.logging.enable_explicit_format() 226 | logger.info(f"Training/evaluation parameters {training_args}") 227 | 228 | # Set seed before initializing model. 229 | set_seed(training_args.seed) 230 | 231 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 232 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 233 | # (the dataset will be downloaded automatically from the datasets Hub 234 | # 235 | # For CSV/JSON files, this script will use the column called 'text' or the first column. You can easily tweak this 236 | # behavior (see below) 237 | # 238 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 239 | # download the dataset. 240 | if data_args.dataset_name is not None: 241 | # Downloading and loading a dataset from the hub. 
242 | datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) 243 | if "validation" not in datasets.keys(): 244 | datasets["validation"] = load_dataset( 245 | data_args.dataset_name, 246 | data_args.dataset_config_name, 247 | split=f"train[:{data_args.validation_split_percentage}%]", 248 | cache_dir=model_args.cache_dir, 249 | ) 250 | datasets["train"] = load_dataset( 251 | data_args.dataset_name, 252 | data_args.dataset_config_name, 253 | split=f"train[{data_args.validation_split_percentage}%:]", 254 | cache_dir=model_args.cache_dir, 255 | ) 256 | else: 257 | data_files = {} 258 | if data_args.train_file is not None: 259 | data_files["train"] = data_args.train_file 260 | if data_args.validation_file is not None: 261 | data_files["validation"] = data_args.validation_file 262 | extension = data_args.train_file.split(".")[-1] 263 | if extension == "txt": 264 | extension = "text" 265 | datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) 266 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 267 | # https://huggingface.co/docs/datasets/loading_datasets.html. 268 | 269 | # Load pretrained model and tokenizer 270 | # 271 | # Distributed training: 272 | # The .from_pretrained methods guarantee that only one local process can concurrently 273 | # download model & vocab. 274 | config_kwargs = { 275 | "cache_dir": model_args.cache_dir, 276 | "revision": model_args.model_revision, 277 | "use_auth_token": True if model_args.use_auth_token else None, 278 | } 279 | if model_args.config_name: 280 | config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) 281 | elif model_args.model_name_or_path: 282 | config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) 283 | else: 284 | config = CONFIG_MAPPING[model_args.model_type]() 285 | logger.warning("You are instantiating a new config instance from scratch.") 286 | 287 | tokenizer_kwargs = { 288 | "cache_dir": model_args.cache_dir, 289 | "use_fast": model_args.use_fast_tokenizer, 290 | "revision": model_args.model_revision, 291 | "use_auth_token": True if model_args.use_auth_token else None, 292 | } 293 | if model_args.tokenizer_name: 294 | tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) 295 | elif model_args.model_name_or_path: 296 | tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) 297 | else: 298 | raise ValueError( 299 | "You are instantiating a new tokenizer from scratch. This is not supported by this script." 300 | "You can do it from another script, save it, and load it from here, using --tokenizer_name." 301 | ) 302 | 303 | if model_args.model_name_or_path: 304 | model = AutoModelForMaskedLM.from_pretrained( 305 | model_args.model_name_or_path, 306 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 307 | config=config, 308 | cache_dir=model_args.cache_dir, 309 | revision=model_args.model_revision, 310 | use_auth_token=True if model_args.use_auth_token else None, 311 | ) 312 | else: 313 | logger.info("Training new model from scratch") 314 | model = AutoModelForMaskedLM.from_config(config) 315 | 316 | model.resize_token_embeddings(len(tokenizer)) 317 | 318 | # Preprocessing the datasets. 319 | # First we tokenize all the texts. 
320 | if training_args.do_train: 321 | column_names = datasets["train"].column_names 322 | else: 323 | column_names = datasets["validation"].column_names 324 | text_column_name = "text" if "text" in column_names else column_names[0] 325 | 326 | if data_args.max_seq_length is None: 327 | max_seq_length = tokenizer.model_max_length 328 | if max_seq_length > 1024: 329 | logger.warning( 330 | f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " 331 | "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." 332 | ) 333 | max_seq_length = 1024 334 | else: 335 | if data_args.max_seq_length > tokenizer.model_max_length: 336 | logger.warning( 337 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" 338 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 339 | ) 340 | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) 341 | 342 | if data_args.line_by_line: 343 | # When using line_by_line, we just tokenize each nonempty line. 344 | padding = "max_length" if data_args.pad_to_max_length else False 345 | 346 | def tokenize_function(examples): 347 | # Remove empty lines 348 | examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()] 349 | return tokenizer( 350 | examples["text"], 351 | padding=padding, 352 | truncation=True, 353 | max_length=max_seq_length, 354 | # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it 355 | # receives the `special_tokens_mask`. 356 | return_special_tokens_mask=True, 357 | ) 358 | 359 | tokenized_datasets = datasets.map( 360 | tokenize_function, 361 | batched=True, 362 | num_proc=data_args.preprocessing_num_workers, 363 | remove_columns=[text_column_name], 364 | load_from_cache_file=not data_args.overwrite_cache, 365 | ) 366 | else: 367 | # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. 368 | # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more 369 | # efficient when it receives the `special_tokens_mask`. 370 | def tokenize_function(examples): 371 | return tokenizer(examples[text_column_name], return_special_tokens_mask=True) 372 | 373 | tokenized_datasets = datasets.map( 374 | tokenize_function, 375 | batched=True, 376 | num_proc=data_args.preprocessing_num_workers, 377 | remove_columns=column_names, 378 | load_from_cache_file=not data_args.overwrite_cache, 379 | ) 380 | 381 | # Main data processing function that will concatenate all texts from our dataset and generate chunks of 382 | # max_seq_length. 383 | def group_texts(examples): 384 | # Concatenate all texts. 385 | concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} 386 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 387 | # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can 388 | # customize this part to your needs. 389 | total_length = (total_length // max_seq_length) * max_seq_length 390 | # Split by chunks of max_len. 
391 | result = { 392 | k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)] 393 | for k, t in concatenated_examples.items() 394 | } 395 | return result 396 | 397 | # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a 398 | # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value 399 | # might be slower to preprocess. 400 | # 401 | # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: 402 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map 403 | 404 | tokenized_datasets = tokenized_datasets.map( 405 | group_texts, 406 | batched=True, 407 | num_proc=data_args.preprocessing_num_workers, 408 | load_from_cache_file=not data_args.overwrite_cache, 409 | ) 410 | 411 | if training_args.do_train: 412 | if "train" not in tokenized_datasets: 413 | raise ValueError("--do_train requires a train dataset") 414 | train_dataset = tokenized_datasets["train"] 415 | if data_args.max_train_samples is not None: 416 | train_dataset = train_dataset.select(range(data_args.max_train_samples)) 417 | 418 | if training_args.do_eval: 419 | if "validation" not in tokenized_datasets: 420 | raise ValueError("--do_eval requires a validation dataset") 421 | eval_dataset = tokenized_datasets["validation"] 422 | if data_args.max_eval_samples is not None: 423 | eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) 424 | 425 | # Data collator 426 | # This one will take care of randomly masking the tokens. 427 | pad_to_multiple_of_8 = data_args.line_by_line and training_args.fp16 and not data_args.pad_to_max_length 428 | data_collator = DataCollatorForLanguageModeling( 429 | tokenizer=tokenizer, 430 | mlm_probability=data_args.mlm_probability, 431 | pad_to_multiple_of=8 if pad_to_multiple_of_8 else None, 432 | ) 433 | 434 | # Initialize our Trainer 435 | trainer = Trainer( 436 | model=model, 437 | args=training_args, 438 | train_dataset=train_dataset if training_args.do_train else None, 439 | eval_dataset=eval_dataset if training_args.do_eval else None, 440 | tokenizer=tokenizer, 441 | data_collator=data_collator, 442 | ) 443 | 444 | # Training 445 | if training_args.do_train: 446 | checkpoint = None 447 | if training_args.resume_from_checkpoint is not None: 448 | checkpoint = training_args.resume_from_checkpoint 449 | elif last_checkpoint is not None: 450 | checkpoint = last_checkpoint 451 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 452 | trainer.save_model() # Saves the tokenizer too for easy upload 453 | metrics = train_result.metrics 454 | 455 | max_train_samples = ( 456 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 457 | ) 458 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 459 | 460 | trainer.log_metrics("train", metrics) 461 | trainer.save_metrics("train", metrics) 462 | trainer.save_state() 463 | 464 | # Evaluation 465 | if training_args.do_eval: 466 | logger.info("*** Evaluate ***") 467 | 468 | metrics = trainer.evaluate() 469 | 470 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 471 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 472 | perplexity = math.exp(metrics["eval_loss"]) 473 | metrics["perplexity"] = perplexity 474 | 475 | trainer.log_metrics("eval", metrics) 476 | 
trainer.save_metrics("eval", metrics) 477 | 478 | if training_args.push_to_hub: 479 | kwargs = {"finetuned_from": model_args.model_name_or_path, "tags": "fill-mask"} 480 | if data_args.dataset_name is not None: 481 | kwargs["dataset_tags"] = data_args.dataset_name 482 | if data_args.dataset_config_name is not None: 483 | kwargs["dataset_args"] = data_args.dataset_config_name 484 | kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" 485 | else: 486 | kwargs["dataset"] = data_args.dataset_name 487 | 488 | trainer.push_to_hub(**kwargs) 489 | 490 | 491 | def _mp_fn(index): 492 | # For xla_spawn (TPUs) 493 | main() 494 | 495 | 496 | if __name__ == "__main__": 497 | main() 498 | -------------------------------------------------------------------------------- /run_mlm_no_trainer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2021 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) 18 | on a text file or a dataset without using HuggingFace Trainer. 19 | 20 | Here is the full list of checkpoints on the hub that can be fine-tuned by this script: 21 | https://huggingface.co/models?filter=masked-lm 22 | """ 23 | # You can also adapt this script on your own mlm task. Pointers for this are left as comments. 24 | 25 | import argparse 26 | import logging 27 | import math 28 | import os 29 | import random 30 | 31 | import datasets 32 | import torch 33 | from datasets import load_dataset 34 | from torch.utils.data.dataloader import DataLoader 35 | from tqdm.auto import tqdm 36 | 37 | import transformers 38 | from accelerate import Accelerator 39 | from transformers import ( 40 | CONFIG_MAPPING, 41 | MODEL_MAPPING, 42 | AdamW, 43 | AutoConfig, 44 | AutoModelForMaskedLM, 45 | AutoTokenizer, 46 | DataCollatorForLanguageModeling, 47 | SchedulerType, 48 | get_scheduler, 49 | set_seed, 50 | ) 51 | 52 | 53 | logger = logging.getLogger(__name__) 54 | MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) 55 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 56 | 57 | 58 | def parse_args(): 59 | parser = argparse.ArgumentParser(description="Finetune a transformers model on a Masked Language Modeling task") 60 | parser.add_argument( 61 | "--dataset_name", 62 | type=str, 63 | default=None, 64 | help="The name of the dataset to use (via the datasets library).", 65 | ) 66 | parser.add_argument( 67 | "--dataset_config_name", 68 | type=str, 69 | default=None, 70 | help="The configuration name of the dataset to use (via the datasets library).", 71 | ) 72 | parser.add_argument( 73 | "--train_file", type=str, default=None, help="A csv or a json file containing the training data." 
74 | ) 75 | parser.add_argument( 76 | "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data." 77 | ) 78 | parser.add_argument( 79 | "--validation_split_percentage", 80 | default=5, 81 | help="The percentage of the train set used as validation set in case there's no validation split", 82 | ) 83 | parser.add_argument( 84 | "--pad_to_max_length", 85 | action="store_true", 86 | help="If passed, pad all samples to `max_length`. Otherwise, dynamic padding is used.", 87 | ) 88 | parser.add_argument( 89 | "--model_name_or_path", 90 | type=str, 91 | help="Path to pretrained model or model identifier from huggingface.co/models.", 92 | required=True, 93 | ) 94 | parser.add_argument( 95 | "--config_name", 96 | type=str, 97 | default=None, 98 | help="Pretrained config name or path if not the same as model_name", 99 | ) 100 | parser.add_argument( 101 | "--tokenizer_name", 102 | type=str, 103 | default=None, 104 | help="Pretrained tokenizer name or path if not the same as model_name", 105 | ) 106 | parser.add_argument( 107 | "--use_slow_tokenizer", 108 | action="store_true", 109 | help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).", 110 | ) 111 | parser.add_argument( 112 | "--per_device_train_batch_size", 113 | type=int, 114 | default=8, 115 | help="Batch size (per device) for the training dataloader.", 116 | ) 117 | parser.add_argument( 118 | "--per_device_eval_batch_size", 119 | type=int, 120 | default=8, 121 | help="Batch size (per device) for the evaluation dataloader.", 122 | ) 123 | parser.add_argument( 124 | "--learning_rate", 125 | type=float, 126 | default=5e-5, 127 | help="Initial learning rate (after the potential warmup period) to use.", 128 | ) 129 | parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.") 130 | parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.") 131 | parser.add_argument( 132 | "--max_train_steps", 133 | type=int, 134 | default=None, 135 | help="Total number of training steps to perform. If provided, overrides num_train_epochs.", 136 | ) 137 | parser.add_argument( 138 | "--gradient_accumulation_steps", 139 | type=int, 140 | default=1, 141 | help="Number of updates steps to accumulate before performing a backward/update pass.", 142 | ) 143 | parser.add_argument( 144 | "--lr_scheduler_type", 145 | type=SchedulerType, 146 | default="linear", 147 | help="The scheduler type to use.", 148 | choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"], 149 | ) 150 | parser.add_argument( 151 | "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler." 152 | ) 153 | parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.") 154 | parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") 155 | parser.add_argument( 156 | "--model_type", 157 | type=str, 158 | default=None, 159 | help="Model type to use if training from scratch.", 160 | choices=MODEL_TYPES, 161 | ) 162 | parser.add_argument( 163 | "--max_seq_length", 164 | type=int, 165 | default=None, 166 | help="The maximum total input sequence length after tokenization. 
Sequences longer than this will be truncated.", 167 | ) 168 | parser.add_argument( 169 | "--line_by_line", 170 | type=bool, 171 | default=False, 172 | help="Whether distinct lines of text in the dataset are to be handled as distinct sequences.", 173 | ) 174 | parser.add_argument( 175 | "--preprocessing_num_workers", 176 | type=int, 177 | default=None, 178 | help="The number of processes to use for the preprocessing.", 179 | ) 180 | parser.add_argument( 181 | "--overwrite_cache", type=bool, default=False, help="Overwrite the cached training and evaluation sets" 182 | ) 183 | parser.add_argument( 184 | "--mlm_probability", type=float, default=0.15, help="Ratio of tokens to mask for masked language modeling loss" 185 | ) 186 | 187 | args = parser.parse_args() 188 | 189 | # Sanity checks 190 | if args.dataset_name is None and args.train_file is None and args.validation_file is None: 191 | raise ValueError("Need either a dataset name or a training/validation file.") 192 | else: 193 | if args.train_file is not None: 194 | extension = args.train_file.split(".")[-1] 195 | assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file." 196 | if args.validation_file is not None: 197 | extension = args.validation_file.split(".")[-1] 198 | assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file." 199 | 200 | if args.output_dir is not None: 201 | os.makedirs(args.output_dir, exist_ok=True) 202 | 203 | return args 204 | 205 | 206 | def main(): 207 | args = parse_args() 208 | 209 | # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. 210 | accelerator = Accelerator() 211 | # Make one log on every process with the configuration for debugging. 212 | logging.basicConfig( 213 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 214 | datefmt="%m/%d/%Y %H:%M:%S", 215 | level=logging.INFO, 216 | ) 217 | logger.info(accelerator.state) 218 | 219 | # Setup logging, we only want one process per machine to log things on the screen. 220 | # accelerator.is_local_main_process is only True for one process per machine. 221 | logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR) 222 | if accelerator.is_local_main_process: 223 | datasets.utils.logging.set_verbosity_warning() 224 | transformers.utils.logging.set_verbosity_info() 225 | else: 226 | datasets.utils.logging.set_verbosity_error() 227 | transformers.utils.logging.set_verbosity_error() 228 | 229 | # If passed along, set the training seed now. 230 | if args.seed is not None: 231 | set_seed(args.seed) 232 | 233 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 234 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 235 | # (the dataset will be downloaded automatically from the datasets Hub). 236 | # 237 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called 238 | # 'text' is found. You can easily tweak this behavior (see below). 239 | # 240 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 241 | # download the dataset. 242 | if args.dataset_name is not None: 243 | # Downloading and loading a dataset from the hub. 
244 | raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) 245 | if "validation" not in raw_datasets.keys(): 246 | raw_datasets["validation"] = load_dataset( 247 | args.dataset_name, 248 | args.dataset_config_name, 249 | split=f"train[:{args.validation_split_percentage}%]", 250 | ) 251 | raw_datasets["train"] = load_dataset( 252 | args.dataset_name, 253 | args.dataset_config_name, 254 | split=f"train[{args.validation_split_percentage}%:]", 255 | ) 256 | else: 257 | data_files = {} 258 | if args.train_file is not None: 259 | data_files["train"] = args.train_file 260 | if args.validation_file is not None: 261 | data_files["validation"] = args.validation_file 262 | extension = args.train_file.split(".")[-1] 263 | if extension == "txt": 264 | extension = "text" 265 | raw_datasets = load_dataset(extension, data_files=data_files) 266 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 267 | # https://huggingface.co/docs/datasets/loading_datasets.html. 268 | 269 | # Load pretrained model and tokenizer 270 | # 271 | # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently 272 | # download model & vocab. 273 | if args.config_name: 274 | config = AutoConfig.from_pretrained(args.config_name) 275 | elif args.model_name_or_path: 276 | config = AutoConfig.from_pretrained(args.model_name_or_path) 277 | else: 278 | config = CONFIG_MAPPING[args.model_type]() 279 | logger.warning("You are instantiating a new config instance from scratch.") 280 | 281 | if args.tokenizer_name: 282 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer) 283 | elif args.model_name_or_path: 284 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer) 285 | else: 286 | raise ValueError( 287 | "You are instantiating a new tokenizer from scratch. This is not supported by this script." 288 | "You can do it from another script, save it, and load it from here, using --tokenizer_name." 289 | ) 290 | 291 | if args.model_name_or_path: 292 | model = AutoModelForMaskedLM.from_pretrained( 293 | args.model_name_or_path, 294 | from_tf=bool(".ckpt" in args.model_name_or_path), 295 | config=config, 296 | ) 297 | else: 298 | logger.info("Training new model from scratch") 299 | model = AutoModelForMaskedLM.from_config(config) 300 | 301 | model.resize_token_embeddings(len(tokenizer)) 302 | 303 | # Preprocessing the datasets. 304 | # First we tokenize all the texts. 305 | column_names = raw_datasets["train"].column_names 306 | text_column_name = "text" if "text" in column_names else column_names[0] 307 | 308 | if args.max_seq_length is None: 309 | max_seq_length = tokenizer.model_max_length 310 | if max_seq_length > 1024: 311 | logger.warning( 312 | f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " 313 | "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." 314 | ) 315 | max_seq_length = 1024 316 | else: 317 | if args.max_seq_length > tokenizer.model_max_length: 318 | logger.warning( 319 | f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the" 320 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 
321 | ) 322 | max_seq_length = min(args.max_seq_length, tokenizer.model_max_length) 323 | 324 | if args.line_by_line: 325 | # When using line_by_line, we just tokenize each nonempty line. 326 | padding = "max_length" if args.pad_to_max_length else False 327 | 328 | def tokenize_function(examples): 329 | # Remove empty lines 330 | examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()] 331 | return tokenizer( 332 | examples["text"], 333 | padding=padding, 334 | truncation=True, 335 | max_length=max_seq_length, 336 | # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it 337 | # receives the `special_tokens_mask`. 338 | return_special_tokens_mask=True, 339 | ) 340 | 341 | tokenized_datasets = raw_datasets.map( 342 | tokenize_function, 343 | batched=True, 344 | num_proc=args.preprocessing_num_workers, 345 | remove_columns=[text_column_name], 346 | load_from_cache_file=not args.overwrite_cache, 347 | ) 348 | else: 349 | # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. 350 | # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more 351 | # efficient when it receives the `special_tokens_mask`. 352 | def tokenize_function(examples): 353 | return tokenizer(examples[text_column_name], return_special_tokens_mask=True) 354 | 355 | tokenized_datasets = raw_datasets.map( 356 | tokenize_function, 357 | batched=True, 358 | num_proc=args.preprocessing_num_workers, 359 | remove_columns=column_names, 360 | load_from_cache_file=not args.overwrite_cache, 361 | ) 362 | 363 | # Main data processing function that will concatenate all texts from our dataset and generate chunks of 364 | # max_seq_length. 365 | def group_texts(examples): 366 | # Concatenate all texts. 367 | concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} 368 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 369 | # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can 370 | # customize this part to your needs. 371 | total_length = (total_length // max_seq_length) * max_seq_length 372 | # Split by chunks of max_len. 373 | result = { 374 | k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)] 375 | for k, t in concatenated_examples.items() 376 | } 377 | return result 378 | 379 | # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a 380 | # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value 381 | # might be slower to preprocess. 382 | # 383 | # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: 384 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map 385 | 386 | tokenized_datasets = tokenized_datasets.map( 387 | group_texts, 388 | batched=True, 389 | num_proc=args.preprocessing_num_workers, 390 | load_from_cache_file=not args.overwrite_cache, 391 | ) 392 | 393 | train_dataset = tokenized_datasets["train"] 394 | eval_dataset = tokenized_datasets["validation"] 395 | 396 | # Log a few random samples from the training set: 397 | for index in random.sample(range(len(train_dataset)), 3): 398 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") 399 | 400 | # Data collator 401 | # This one will take care of randomly masking the tokens. 402 | data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=args.mlm_probability) 403 | 404 | # DataLoaders creation: 405 | train_dataloader = DataLoader( 406 | train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size 407 | ) 408 | eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) 409 | 410 | # Optimizer 411 | # Split weights in two groups, one with weight decay and the other not. 412 | no_decay = ["bias", "LayerNorm.weight"] 413 | optimizer_grouped_parameters = [ 414 | { 415 | "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 416 | "weight_decay": args.weight_decay, 417 | }, 418 | { 419 | "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 420 | "weight_decay": 0.0, 421 | }, 422 | ] 423 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) 424 | 425 | # Prepare everything with our `accelerator`. 426 | model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( 427 | model, optimizer, train_dataloader, eval_dataloader 428 | ) 429 | 430 | # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be 431 | # shorter in multiprocess) 432 | 433 | # Scheduler and math around the number of training steps. 434 | num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) 435 | if args.max_train_steps is None: 436 | args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch 437 | else: 438 | args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) 439 | 440 | lr_scheduler = get_scheduler( 441 | name=args.lr_scheduler_type, 442 | optimizer=optimizer, 443 | num_warmup_steps=args.num_warmup_steps, 444 | num_training_steps=args.max_train_steps, 445 | ) 446 | 447 | # Train! 448 | total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps 449 | 450 | logger.info("***** Running training *****") 451 | logger.info(f" Num examples = {len(train_dataset)}") 452 | logger.info(f" Num Epochs = {args.num_train_epochs}") 453 | logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") 454 | logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") 455 | logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") 456 | logger.info(f" Total optimization steps = {args.max_train_steps}") 457 | # Only show the progress bar once on each machine. 
458 | progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) 459 | completed_steps = 0 460 | 461 | for epoch in range(args.num_train_epochs): 462 | model.train() 463 | for step, batch in enumerate(train_dataloader): 464 | outputs = model(**batch) 465 | loss = outputs.loss 466 | loss = loss / args.gradient_accumulation_steps 467 | accelerator.backward(loss) 468 | if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: 469 | optimizer.step() 470 | lr_scheduler.step() 471 | optimizer.zero_grad() 472 | progress_bar.update(1) 473 | completed_steps += 1 474 | 475 | if completed_steps >= args.max_train_steps: 476 | break 477 | 478 | model.eval() 479 | losses = [] 480 | for step, batch in enumerate(eval_dataloader): 481 | with torch.no_grad(): 482 | outputs = model(**batch) 483 | 484 | loss = outputs.loss 485 | losses.append(accelerator.gather(loss.repeat(args.per_device_eval_batch_size))) 486 | 487 | losses = torch.cat(losses) 488 | losses = losses[: len(eval_dataset)] 489 | perplexity = math.exp(torch.mean(losses)) 490 | 491 | logger.info(f"epoch {epoch}: perplexity: {perplexity}") 492 | 493 | if args.output_dir is not None: 494 | accelerator.wait_for_everyone() 495 | unwrapped_model = accelerator.unwrap_model(model) 496 | unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save) 497 | 498 | 499 | if __name__ == "__main__": 500 | main() 501 | -------------------------------------------------------------------------------- /run_plm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Team All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for permutation language modeling. 18 | """ 19 | # You can also adapt this script on your own permutation language modeling task. Pointers for this are left as comments. 20 | 21 | import logging 22 | import math 23 | import os 24 | import sys 25 | from dataclasses import dataclass, field 26 | from typing import Optional 27 | 28 | from datasets import load_dataset 29 | 30 | import transformers 31 | from transformers import ( 32 | AutoConfig, 33 | AutoTokenizer, 34 | DataCollatorForPermutationLanguageModeling, 35 | HfArgumentParser, 36 | Trainer, 37 | TrainingArguments, 38 | XLNetConfig, 39 | XLNetLMHeadModel, 40 | set_seed, 41 | ) 42 | from transformers.trainer_utils import get_last_checkpoint, is_main_process 43 | from transformers.utils import check_min_version 44 | 45 | 46 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 47 | check_min_version("4.7.0.dev0") 48 | 49 | logger = logging.getLogger(__name__) 50 | 51 | 52 | @dataclass 53 | class ModelArguments: 54 | """ 55 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 
56 | """ 57 | 58 | model_name_or_path: Optional[str] = field( 59 | default=None, 60 | metadata={ 61 | "help": "The model checkpoint for weights initialization." 62 | "Don't set if you want to train a model from scratch." 63 | }, 64 | ) 65 | config_name: Optional[str] = field( 66 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 67 | ) 68 | tokenizer_name: Optional[str] = field( 69 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 70 | ) 71 | cache_dir: Optional[str] = field( 72 | default=None, 73 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 74 | ) 75 | use_fast_tokenizer: bool = field( 76 | default=True, 77 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 78 | ) 79 | model_revision: str = field( 80 | default="main", 81 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 82 | ) 83 | use_auth_token: bool = field( 84 | default=False, 85 | metadata={ 86 | "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " 87 | "with private models)." 88 | }, 89 | ) 90 | 91 | 92 | @dataclass 93 | class DataTrainingArguments: 94 | """ 95 | Arguments pertaining to what data we are going to input our model for training and eval. 96 | """ 97 | 98 | dataset_name: Optional[str] = field( 99 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 100 | ) 101 | dataset_config_name: Optional[str] = field( 102 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 103 | ) 104 | train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) 105 | validation_file: Optional[str] = field( 106 | default=None, 107 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, 108 | ) 109 | overwrite_cache: bool = field( 110 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 111 | ) 112 | validation_split_percentage: Optional[int] = field( 113 | default=5, 114 | metadata={ 115 | "help": "The percentage of the train set used as validation set in case there's no validation split" 116 | }, 117 | ) 118 | max_seq_length: int = field( 119 | default=512, 120 | metadata={ 121 | "help": "The maximum total input sequence length after tokenization. Sequences longer " 122 | "than this will be truncated." 123 | }, 124 | ) 125 | preprocessing_num_workers: Optional[int] = field( 126 | default=None, 127 | metadata={"help": "The number of processes to use for the preprocessing."}, 128 | ) 129 | plm_probability: float = field( 130 | default=1 / 6, 131 | metadata={ 132 | "help": "Ratio of length of a span of masked tokens to surrounding context length for " 133 | "permutation language modeling." 134 | }, 135 | ) 136 | max_span_length: int = field( 137 | default=5, metadata={"help": "Maximum length of a span of masked tokens for permutation language modeling."} 138 | ) 139 | line_by_line: bool = field( 140 | default=False, 141 | metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."}, 142 | ) 143 | pad_to_max_length: bool = field( 144 | default=False, 145 | metadata={ 146 | "help": "Whether to pad all samples to `max_seq_length`. 
" 147 | "If False, will pad the samples dynamically when batching to the maximum length in the batch." 148 | }, 149 | ) 150 | max_train_samples: Optional[int] = field( 151 | default=None, 152 | metadata={ 153 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this " 154 | "value if set." 155 | }, 156 | ) 157 | max_eval_samples: Optional[int] = field( 158 | default=None, 159 | metadata={ 160 | "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 161 | "value if set." 162 | }, 163 | ) 164 | 165 | def __post_init__(self): 166 | if self.dataset_name is None and self.train_file is None and self.validation_file is None: 167 | raise ValueError("Need either a dataset name or a training/validation file.") 168 | else: 169 | if self.train_file is not None: 170 | extension = self.train_file.split(".")[-1] 171 | assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." 172 | if self.validation_file is not None: 173 | extension = self.validation_file.split(".")[-1] 174 | assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." 175 | 176 | 177 | def main(): 178 | # See all possible arguments in src/transformers/training_args.py 179 | # or by passing the --help flag to this script. 180 | # We now keep distinct sets of args, for a cleaner separation of concerns. 181 | 182 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 183 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 184 | # If we pass only one argument to the script and it's the path to a json file, 185 | # let's parse it to get our arguments. 186 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 187 | else: 188 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 189 | 190 | # Detecting last checkpoint. 191 | last_checkpoint = None 192 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 193 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 194 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 195 | raise ValueError( 196 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 197 | "Use --overwrite_output_dir to overcome." 198 | ) 199 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 200 | logger.info( 201 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 202 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
203 | ) 204 | 205 | # Setup logging 206 | logging.basicConfig( 207 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 208 | datefmt="%m/%d/%Y %H:%M:%S", 209 | handlers=[logging.StreamHandler(sys.stdout)], 210 | ) 211 | logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) 212 | 213 | # Log on each process the small summary: 214 | logger.warning( 215 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 216 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 217 | ) 218 | # Set the verbosity to info of the Transformers logger (on main process only): 219 | if is_main_process(training_args.local_rank): 220 | transformers.utils.logging.set_verbosity_info() 221 | transformers.utils.logging.enable_default_handler() 222 | transformers.utils.logging.enable_explicit_format() 223 | logger.info(f"Training/evaluation parameters {training_args}") 224 | 225 | # Set seed before initializing model. 226 | set_seed(training_args.seed) 227 | 228 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 229 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 230 | # (the dataset will be downloaded automatically from the datasets Hub). 231 | # 232 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called 233 | # 'text' is found. You can easily tweak this behavior (see below). 234 | # 235 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 236 | # download the dataset. 237 | if data_args.dataset_name is not None: 238 | # Downloading and loading a dataset from the hub. 239 | datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) 240 | if "validation" not in datasets.keys(): 241 | datasets["validation"] = load_dataset( 242 | data_args.dataset_name, 243 | data_args.dataset_config_name, 244 | split=f"train[:{data_args.validation_split_percentage}%]", 245 | cache_dir=model_args.cache_dir, 246 | ) 247 | datasets["train"] = load_dataset( 248 | data_args.dataset_name, 249 | data_args.dataset_config_name, 250 | split=f"train[{data_args.validation_split_percentage}%:]", 251 | cache_dir=model_args.cache_dir, 252 | ) 253 | else: 254 | data_files = {} 255 | if data_args.train_file is not None: 256 | data_files["train"] = data_args.train_file 257 | if data_args.validation_file is not None: 258 | data_files["validation"] = data_args.validation_file 259 | extension = data_args.train_file.split(".")[-1] 260 | if extension == "txt": 261 | extension = "text" 262 | datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) 263 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 264 | # https://huggingface.co/docs/datasets/loading_datasets.html. 265 | 266 | # Load pretrained model and tokenizer 267 | # 268 | # Distributed training: 269 | # The .from_pretrained methods guarantee that only one local process can concurrently 270 | # download model & vocab. 
271 | config_kwargs = { 272 | "cache_dir": model_args.cache_dir, 273 | "revision": model_args.model_revision, 274 | "use_auth_token": True if model_args.use_auth_token else None, 275 | } 276 | if model_args.config_name: 277 | config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) 278 | elif model_args.model_name_or_path: 279 | config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) 280 | else: 281 | config = XLNetConfig() 282 | logger.warning("You are instantiating a new config instance from scratch.") 283 | 284 | tokenizer_kwargs = { 285 | "cache_dir": model_args.cache_dir, 286 | "use_fast": model_args.use_fast_tokenizer, 287 | "revision": model_args.model_revision, 288 | "use_auth_token": True if model_args.use_auth_token else None, 289 | } 290 | if model_args.tokenizer_name: 291 | tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) 292 | elif model_args.model_name_or_path: 293 | tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) 294 | else: 295 | raise ValueError( 296 | "You are instantiating a new tokenizer from scratch. This is not supported by this script." 297 | "You can do it from another script, save it, and load it from here, using --tokenizer_name." 298 | ) 299 | 300 | if model_args.model_name_or_path: 301 | model = XLNetLMHeadModel.from_pretrained( 302 | model_args.model_name_or_path, 303 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 304 | config=config, 305 | cache_dir=model_args.cache_dir, 306 | revision=model_args.model_revision, 307 | use_auth_token=True if model_args.use_auth_token else None, 308 | ) 309 | else: 310 | logger.info("Training new model from scratch") 311 | model = XLNetLMHeadModel.from_config(config) 312 | 313 | model.resize_token_embeddings(len(tokenizer)) 314 | 315 | # Preprocessing the datasets. 316 | # First we tokenize all the texts. 317 | if training_args.do_train: 318 | column_names = datasets["train"].column_names 319 | else: 320 | column_names = datasets["validation"].column_names 321 | text_column_name = "text" if "text" in column_names else column_names[0] 322 | 323 | if data_args.max_seq_length > tokenizer.model_max_length: 324 | logger.warning( 325 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" 326 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 327 | ) 328 | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) 329 | 330 | if data_args.line_by_line: 331 | # When using line_by_line, we just tokenize each nonempty line. 332 | padding = "max_length" if data_args.pad_to_max_length else False 333 | 334 | def tokenize_function(examples): 335 | # Remove empty lines 336 | examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()] 337 | return tokenizer(examples["text"], padding=padding, truncation=True, max_length=max_seq_length) 338 | 339 | tokenized_datasets = datasets.map( 340 | tokenize_function, 341 | batched=True, 342 | num_proc=data_args.preprocessing_num_workers, 343 | remove_columns=[text_column_name], 344 | load_from_cache_file=not data_args.overwrite_cache, 345 | ) 346 | else: 347 | # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. 
348 | def tokenize_function(examples): 349 | return tokenizer(examples[text_column_name]) 350 | 351 | tokenized_datasets = datasets.map( 352 | tokenize_function, 353 | batched=True, 354 | num_proc=data_args.preprocessing_num_workers, 355 | remove_columns=column_names, 356 | load_from_cache_file=not data_args.overwrite_cache, 357 | ) 358 | 359 | # Main data processing function that will concatenate all texts from our dataset and generate chunks of 360 | # max_seq_length. 361 | def group_texts(examples): 362 | # Concatenate all texts. 363 | concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} 364 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 365 | # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can 366 | # customize this part to your needs. 367 | total_length = (total_length // max_seq_length) * max_seq_length 368 | # Split by chunks of max_len. 369 | result = { 370 | k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)] 371 | for k, t in concatenated_examples.items() 372 | } 373 | return result 374 | 375 | # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a 376 | # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value 377 | # might be slower to preprocess. 378 | # 379 | # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: 380 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map 381 | 382 | tokenized_datasets = tokenized_datasets.map( 383 | group_texts, 384 | batched=True, 385 | num_proc=data_args.preprocessing_num_workers, 386 | load_from_cache_file=not data_args.overwrite_cache, 387 | ) 388 | 389 | if training_args.do_train: 390 | if "train" not in tokenized_datasets: 391 | raise ValueError("--do_train requires a train dataset") 392 | train_dataset = tokenized_datasets["train"] 393 | if data_args.max_train_samples is not None: 394 | train_dataset = train_dataset.select(range(data_args.max_train_samples)) 395 | 396 | if training_args.do_eval: 397 | if "validation" not in tokenized_datasets: 398 | raise ValueError("--do_eval requires a validation dataset") 399 | eval_dataset = tokenized_datasets["validation"] 400 | if data_args.max_eval_samples is not None: 401 | eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) 402 | 403 | # Data collator 404 | data_collator = DataCollatorForPermutationLanguageModeling( 405 | tokenizer=tokenizer, 406 | plm_probability=data_args.plm_probability, 407 | max_span_length=data_args.max_span_length, 408 | ) 409 | 410 | # Initialize our Trainer 411 | trainer = Trainer( 412 | model=model, 413 | args=training_args, 414 | train_dataset=train_dataset if training_args.do_train else None, 415 | eval_dataset=eval_dataset if training_args.do_eval else None, 416 | tokenizer=tokenizer, 417 | data_collator=data_collator, 418 | ) 419 | 420 | # Training 421 | if training_args.do_train: 422 | checkpoint = None 423 | if training_args.resume_from_checkpoint is not None: 424 | checkpoint = training_args.resume_from_checkpoint 425 | elif last_checkpoint is not None: 426 | checkpoint = last_checkpoint 427 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 428 | trainer.save_model() # Saves the tokenizer too for easy upload 429 | metrics = train_result.metrics 430 | 431 | max_train_samples = ( 432 | 
data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 433 | ) 434 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 435 | 436 | trainer.log_metrics("train", metrics) 437 | trainer.save_metrics("train", metrics) 438 | trainer.save_state() 439 | 440 | # Evaluation 441 | if training_args.do_eval: 442 | logger.info("*** Evaluate ***") 443 | 444 | metrics = trainer.evaluate() 445 | 446 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 447 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 448 | perplexity = math.exp(metrics["eval_loss"]) 449 | metrics["perplexity"] = perplexity 450 | 451 | trainer.log_metrics("eval", metrics) 452 | trainer.save_metrics("eval", metrics) 453 | 454 | if training_args.push_to_hub: 455 | kwargs = {"finetuned_from": model_args.model_name_or_path, "tags": "language-modeling"} 456 | if data_args.dataset_name is not None: 457 | kwargs["dataset_tags"] = data_args.dataset_name 458 | if data_args.dataset_config_name is not None: 459 | kwargs["dataset_args"] = data_args.dataset_config_name 460 | kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" 461 | else: 462 | kwargs["dataset"] = data_args.dataset_name 463 | 464 | trainer.push_to_hub(**kwargs) 465 | 466 | 467 | def _mp_fn(index): 468 | # For xla_spawn (TPUs) 469 | main() 470 | 471 | 472 | if __name__ == "__main__": 473 | main() 474 | -------------------------------------------------------------------------------- /zero3_gpu_run_mlm.sh: -------------------------------------------------------------------------------- 1 | # export NCCL_IB_DISABLE=1 2 | export BS=32 3 | export NCCL_DEBUG=INFO 4 | 5 | deepspeed run_mlm.py \ 6 | --seed 42 \ 7 | --model_type bert \ 8 | --tokenizer_name beomi/KcELECTRA-base \ 9 | --train_file ./sampled_20190101_20200611_v2.txt \ 10 | --num_train_epochs 2 \ 11 | --per_device_train_batch_size $BS \ 12 | --per_device_eval_batch_size $BS \ 13 | --do_train \ 14 | --output_dir ./test-bert-zero3 \ 15 | --fp16 \ 16 | --logging_first_step \ 17 | --max_seq_length 300 \ 18 | --deepspeed ./ds_zero3_1gpu.json --------------------------------------------------------------------------------
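> Note: the repository's own launch scripts (`gpu_run_mlm.sh`, `zero3_gpu_run_mlm.sh`) target the Trainer-based `run_mlm.py`. As a rough sketch of how the `run_mlm_no_trainer.py` variant could be launched — assuming 🤗 Accelerate has already been set up via `accelerate config`, and using `bert-base-uncased` and the public `wikitext` dataset purely as placeholder values, not as anything shipped with this repo — an invocation might look like the following.

```bash
# Illustrative only: single-node launch of the Accelerate-based MLM script.
# Model and dataset names below are placeholders; swap in your own checkpoint/data.
accelerate launch run_mlm_no_trainer.py \
    --model_name_or_path bert-base-uncased \
    --dataset_name wikitext \
    --dataset_config_name wikitext-2-raw-v1 \
    --per_device_train_batch_size 8 \
    --num_train_epochs 3 \
    --output_dir ./test-mlm-no-trainer
```

All flags above are defined in `parse_args` of `run_mlm_no_trainer.py`. When training from a local text file instead of a hub dataset, pass both `--train_file` and `--validation_file`: with local files this script does not create a validation split automatically, and it evaluates perplexity on the `validation` split at the end of every epoch.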