├── .gitignore ├── README.md ├── ds_zero2_1gpu.json ├── ds_zero3_1gpu.json ├── gpu_run_mlm.sh ├── requirements.txt ├── run_clm.py ├── run_clm_no_trainer.py ├── run_mlm.py ├── run_mlm_no_trainer.py ├── run_plm.py ├── sampled_20190101_20200611_v2.txt └── zero3_gpu_run_mlm.sh /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/linux,python,visualstudiocode,macos 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=linux,python,visualstudiocode,macos 4 | 5 | ### Linux ### 6 | *~ 7 | 8 | # temporary files which can be created if a process still has a handle open of a deleted file 9 | .fuse_hidden* 10 | 11 | # KDE directory preferences 12 | .directory 13 | 14 | # Linux trash folder which might appear on any partition or disk 15 | .Trash-* 16 | 17 | # .nfs files are created when an open file is removed but is still being accessed 18 | .nfs* 19 | 20 | ### macOS ### 21 | # General 22 | .DS_Store 23 | .AppleDouble 24 | .LSOverride 25 | 26 | # Icon must end with two \r 27 | Icon 28 | 29 | 30 | # Thumbnails 31 | ._* 32 | 33 | # Files that might appear in the root of a volume 34 | .DocumentRevisions-V100 35 | .fseventsd 36 | .Spotlight-V100 37 | .TemporaryItems 38 | .Trashes 39 | .VolumeIcon.icns 40 | .com.apple.timemachine.donotpresent 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | 49 | ### Python ### 50 | # Byte-compiled / optimized / DLL files 51 | __pycache__/ 52 | *.py[cod] 53 | *$py.class 54 | 55 | # C extensions 56 | *.so 57 | 58 | # Distribution / packaging 59 | .Python 60 | build/ 61 | develop-eggs/ 62 | dist/ 63 | downloads/ 64 | eggs/ 65 | .eggs/ 66 | parts/ 67 | sdist/ 68 | var/ 69 | wheels/ 70 | pip-wheel-metadata/ 71 | share/python-wheels/ 72 | *.egg-info/ 73 | .installed.cfg 74 | *.egg 75 | MANIFEST 76 | 77 | # PyInstaller 78 | # Usually these files are written by a python script from a template 79 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 80 | *.manifest 81 | *.spec 82 | 83 | # Installer logs 84 | pip-log.txt 85 | pip-delete-this-directory.txt 86 | 87 | # Unit test / coverage reports 88 | htmlcov/ 89 | .tox/ 90 | .nox/ 91 | .coverage 92 | .coverage.* 93 | .cache 94 | nosetests.xml 95 | coverage.xml 96 | *.cover 97 | *.py,cover 98 | .hypothesis/ 99 | .pytest_cache/ 100 | pytestdebug.log 101 | 102 | # Translations 103 | *.mo 104 | *.pot 105 | 106 | # Django stuff: 107 | *.log 108 | local_settings.py 109 | db.sqlite3 110 | db.sqlite3-journal 111 | 112 | # Flask stuff: 113 | instance/ 114 | .webassets-cache 115 | 116 | # Scrapy stuff: 117 | .scrapy 118 | 119 | # Sphinx documentation 120 | docs/_build/ 121 | doc/_build/ 122 | 123 | # PyBuilder 124 | target/ 125 | 126 | # Jupyter Notebook 127 | .ipynb_checkpoints 128 | 129 | # IPython 130 | profile_default/ 131 | ipython_config.py 132 | 133 | # pyenv 134 | .python-version 135 | 136 | # pipenv 137 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 138 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 139 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 140 | # install all needed dependencies. 141 | #Pipfile.lock 142 | 143 | # poetry 144 | #poetry.lock 145 | 146 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 147 | __pypackages__/ 148 | 149 | # Celery stuff 150 | celerybeat-schedule 151 | celerybeat.pid 152 | 153 | # SageMath parsed files 154 | *.sage.py 155 | 156 | # Environments 157 | # .env 158 | .env/ 159 | .venv/ 160 | env/ 161 | venv/ 162 | ENV/ 163 | env.bak/ 164 | venv.bak/ 165 | pythonenv* 166 | 167 | # Spyder project settings 168 | .spyderproject 169 | .spyproject 170 | 171 | # Rope project settings 172 | .ropeproject 173 | 174 | # mkdocs documentation 175 | /site 176 | 177 | # mypy 178 | .mypy_cache/ 179 | .dmypy.json 180 | dmypy.json 181 | 182 | # Pyre type checker 183 | .pyre/ 184 | 185 | # pytype static type analyzer 186 | .pytype/ 187 | 188 | # operating system-related files 189 | # file properties cache/storage on macOS 190 | *.DS_Store 191 | # thumbnail cache on Windows 192 | Thumbs.db 193 | 194 | # profiling data 195 | .prof 196 | 197 | 198 | ### VisualStudioCode ### 199 | .vscode/ 200 | .vscode/* 201 | !.vscode/settings.json 202 | !.vscode/tasks.json 203 | !.vscode/launch.json 204 | !.vscode/extensions.json 205 | *.code-workspace 206 | 207 | ### VisualStudioCode Patch ### 208 | # Ignore all local history of files 209 | .history 210 | .ionide 211 | 212 | # End of https://www.toptal.com/developers/gitignore/api/linux,python,visualstudiocode,macos 213 | # 214 | .env 215 | runs/ 216 | test*/ 217 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | > Fork from https://github.com/huggingface/transformers/tree/86d5fb0b360e68de46d40265e7c707fe68c8015b/examples/pytorch/language-modeling at 2021.05.17. 2 | 3 | 4 | 19 | 20 | ## Language model training 21 | 22 | Fine-tuning (or training from scratch) the library models for language modeling on a text dataset for GPT, GPT-2, 23 | ALBERT, BERT, DistilBERT, RoBERTa, XLNet... GPT and GPT-2 are trained or fine-tuned using a causal language modeling 24 | (CLM) loss while ALBERT, BERT, DistilBERT and RoBERTa are trained or fine-tuned using a masked language modeling (MLM) 25 | loss. XLNet uses permutation language modeling (PLM), you can find more information about the differences between those 26 | objectives in our [model summary](https://huggingface.co/transformers/model_summary.html). 27 | 28 | There are two sets of scripts provided. The first set leverages the Trainer API. The second set with `no_trainer` in the suffix uses a custom training loop and leverages the 🤗 Accelerate library . Both sets use the 🤗 Datasets library. You can easily customize them to your needs if you need extra processing on your datasets. 29 | 30 | **Note:** The old script `run_language_modeling.py` is still available [here](https://github.com/huggingface/transformers/blob/master/examples/legacy/run_language_modeling.py). 31 | 32 | The following examples, will run on datasets hosted on our [hub](https://huggingface.co/datasets) or with your own 33 | text files for training and validation. We give examples of both below. 34 | 35 | ### GPT-2/GPT and causal language modeling 36 | 37 | The following example fine-tunes GPT-2 on WikiText-2. We're using the raw WikiText-2 (no tokens were replaced before 38 | the tokenization). The loss here is that of causal language modeling. 
39 | 40 | ```bash 41 | python run_clm.py \ 42 | --model_name_or_path gpt2 \ 43 | --dataset_name wikitext \ 44 | --dataset_config_name wikitext-2-raw-v1 \ 45 | --do_train \ 46 | --do_eval \ 47 | --output_dir /tmp/test-clm 48 | ``` 49 | 50 | This takes about half an hour to train on a single K80 GPU and about one minute for the evaluation to run. It reaches 51 | a score of ~20 perplexity once fine-tuned on the dataset. 52 | 53 | To run on your own training and validation files, use the following command: 54 | 55 | ```bash 56 | python run_clm.py \ 57 | --model_name_or_path gpt2 \ 58 | --train_file path_to_train_file \ 59 | --validation_file path_to_validation_file \ 60 | --do_train \ 61 | --do_eval \ 62 | --output_dir /tmp/test-clm 63 | ``` 64 | 65 | This uses the built in HuggingFace `Trainer` for training. If you want to use a custom training loop, you can utilize or adapt the `run_clm_no_trainer.py` script. Take a look at the script for a list of supported arguments. An example is shown below: 66 | 67 | ```bash 68 | python run_clm_no_trainer.py \ 69 | --dataset_name wikitext \ 70 | --dataset_config_name wikitext-2-raw-v1 \ 71 | --model_name_or_path gpt2 \ 72 | --output_dir /tmp/test-clm 73 | ``` 74 | 75 | ### RoBERTa/BERT/DistilBERT and masked language modeling 76 | 77 | The following example fine-tunes RoBERTa on WikiText-2. Here too, we're using the raw WikiText-2. The loss is different 78 | as BERT/RoBERTa have a bidirectional mechanism; we're therefore using the same loss that was used during their 79 | pre-training: masked language modeling. 80 | 81 | In accordance to the RoBERTa paper, we use dynamic masking rather than static masking. The model may, therefore, 82 | converge slightly slower (over-fitting takes more epochs). 83 | 84 | ```bash 85 | python run_mlm.py \ 86 | --model_name_or_path roberta-base \ 87 | --dataset_name wikitext \ 88 | --dataset_config_name wikitext-2-raw-v1 \ 89 | --do_train \ 90 | --do_eval \ 91 | --output_dir /tmp/test-mlm 92 | ``` 93 | 94 | To run on your own training and validation files, use the following command: 95 | 96 | ```bash 97 | python run_mlm.py \ 98 | --model_name_or_path roberta-base \ 99 | --train_file path_to_train_file \ 100 | --validation_file path_to_validation_file \ 101 | --do_train \ 102 | --do_eval \ 103 | --output_dir /tmp/test-mlm 104 | ``` 105 | 106 | If your dataset is organized with one sample per line, you can use the `--line_by_line` flag (otherwise the script 107 | concatenates all texts and then splits them in blocks of the same length). 108 | 109 | This uses the built in HuggingFace `Trainer` for training. If you want to use a custom training loop, you can utilize or adapt the `run_mlm_no_trainer.py` script. Take a look at the script for a list of supported arguments. An example is shown below: 110 | 111 | ```bash 112 | python run_mlm_no_trainer.py \ 113 | --dataset_name wikitext \ 114 | --dataset_config_name wikitext-2-raw-v1 \ 115 | --model_name_or_path roberta-base \ 116 | --output_dir /tmp/test-mlm 117 | ``` 118 | 119 | **Note:** On TPU, you should use the flag `--pad_to_max_length` in conjunction with the `--line_by_line` flag to make 120 | sure all your batches have the same length. 121 | 122 | ### Whole word masking 123 | 124 | This part was moved to `examples/research_projects/mlm_wwm`. 125 | 126 | ### XLNet and permutation language modeling 127 | 128 | XLNet uses a different training objective, which is permutation language modeling. 
It is an autoregressive method 129 | to learn bidirectional contexts by maximizing the expected likelihood over all permutations of the input 130 | sequence factorization order. 131 | 132 | We use the `--plm_probability` flag to define the ratio of length of a span of masked tokens to surrounding 133 | context length for permutation language modeling. 134 | 135 | The `--max_span_length` flag may also be used to limit the length of a span of masked tokens used 136 | for permutation language modeling. 137 | 138 | Here is how to fine-tune XLNet on wikitext-2: 139 | 140 | ```bash 141 | python run_plm.py \ 142 | --model_name_or_path=xlnet-base-cased \ 143 | --dataset_name wikitext \ 144 | --dataset_config_name wikitext-2-raw-v1 \ 145 | --do_train \ 146 | --do_eval \ 147 | --output_dir /tmp/test-plm 148 | ``` 149 | 150 | To fine-tune it on your own training and validation file, run: 151 | 152 | ```bash 153 | python run_plm.py \ 154 | --model_name_or_path=xlnet-base-cased \ 155 | --train_file path_to_train_file \ 156 | --validation_file path_to_validation_file \ 157 | --do_train \ 158 | --do_eval \ 159 | --output_dir /tmp/test-plm 160 | ``` 161 | 162 | If your dataset is organized with one sample per line, you can use the `--line_by_line` flag (otherwise the script 163 | concatenates all texts and then splits them in blocks of the same length). 164 | 165 | **Note:** On TPU, you should use the flag `--pad_to_max_length` in conjunction with the `--line_by_line` flag to make 166 | sure all your batches have the same length. 167 | -------------------------------------------------------------------------------- /ds_zero2_1gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "betas": "auto", 15 | "eps": "auto", 16 | "weight_decay": "auto" 17 | } 18 | }, 19 | "scheduler": { 20 | "type": "WarmupLR", 21 | "params": { 22 | "warmup_min_lr": "auto", 23 | "warmup_max_lr": "auto", 24 | "warmup_num_steps": "auto" 25 | } 26 | }, 27 | "zero_optimization": { 28 | "stage": 2, 29 | "allgather_partitions": true, 30 | "allgather_bucket_size": 2e8, 31 | "overlap_comm": true, 32 | "reduce_scatter": true, 33 | "reduce_bucket_size": 2e8, 34 | "contiguous_gradients": true, 35 | "cpu_offload": true 36 | }, 37 | "gradient_accumulation_steps": "auto", 38 | "gradient_clipping": "auto", 39 | "train_batch_size": "auto", 40 | "train_micro_batch_size_per_gpu": "auto" 41 | } -------------------------------------------------------------------------------- /ds_zero3_1gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "betas": "auto", 15 | "eps": "auto", 16 | "weight_decay": "auto" 17 | } 18 | }, 19 | "scheduler": { 20 | "type": "WarmupLR", 21 | "params": { 22 | "warmup_min_lr": "auto", 23 | "warmup_max_lr": "auto", 24 | "warmup_num_steps": "auto" 25 | } 26 | }, 27 | "zero_optimization": { 28 | "stage": 3, 29 | "offload_optimizer": { 30 | "device": "cpu", 31 | "pin_memory": true 32 | }, 33 | "offload_param": { 34 | "device": "cpu", 35 | 
"pin_memory": true 36 | }, 37 | "overlap_comm": true, 38 | "contiguous_gradients": true, 39 | "sub_group_size": 1e14, 40 | "reduce_bucket_size": "auto", 41 | "stage3_prefetch_bucket_size": "auto", 42 | "stage3_param_persistence_threshold": "auto", 43 | "stage3_max_live_parameters": 1e9, 44 | "stage3_max_reuse_distance": 1e9, 45 | "stage3_gather_fp16_weights_on_model_save": true 46 | }, 47 | "gradient_accumulation_steps": "auto", 48 | "gradient_clipping": "auto", 49 | "steps_per_print": 2000, 50 | "train_batch_size": "auto", 51 | "train_micro_batch_size_per_gpu": "auto", 52 | "wall_clock_breakdown": false 53 | } -------------------------------------------------------------------------------- /gpu_run_mlm.sh: -------------------------------------------------------------------------------- 1 | rm -rf ./test-bert-zero2-multigpu 2 | 3 | export BS=32 4 | export NCCL_DEBUG=INFO 5 | export NCCL_SHM_DISABLE=1 6 | 7 | deepspeed run_mlm.py \ 8 | --seed 42 \ 9 | --model_type bert \ 10 | --tokenizer_name beomi/KcELECTRA-base \ 11 | --train_file ./sampled_20190101_20200611_v2.txt \ 12 | --num_train_epochs 2 \ 13 | --per_device_train_batch_size $BS \ 14 | --per_device_eval_batch_size $BS \ 15 | --do_train \ 16 | --output_dir ./test-bert-zero2-multigpu \ 17 | --fp16 \ 18 | --logging_first_step \ 19 | --max_seq_length 300 \ 20 | --deepspeed ./ds_zero2_1gpu.json \ 21 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | sentencepiece != 0.1.92 3 | protobuf 4 | -------------------------------------------------------------------------------- /run_clm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. 18 | 19 | Here is the full list of checkpoints on the hub that can be fine-tuned by this script: 20 | https://huggingface.co/models?filter=causal-lm 21 | """ 22 | # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. 
23 | 24 | import logging 25 | import math 26 | import os 27 | import sys 28 | from dataclasses import dataclass, field 29 | from typing import Optional 30 | 31 | from datasets import load_dataset 32 | 33 | import transformers 34 | from transformers import ( 35 | CONFIG_MAPPING, 36 | MODEL_FOR_CAUSAL_LM_MAPPING, 37 | AutoConfig, 38 | AutoModelForCausalLM, 39 | AutoTokenizer, 40 | HfArgumentParser, 41 | Trainer, 42 | TrainingArguments, 43 | default_data_collator, 44 | set_seed, 45 | ) 46 | from transformers.testing_utils import CaptureLogger 47 | from transformers.trainer_utils import get_last_checkpoint, is_main_process 48 | from transformers.utils import check_min_version 49 | 50 | 51 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 52 | check_min_version("4.7.0.dev0") 53 | 54 | logger = logging.getLogger(__name__) 55 | 56 | 57 | MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) 58 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 59 | 60 | 61 | @dataclass 62 | class ModelArguments: 63 | """ 64 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 65 | """ 66 | 67 | model_name_or_path: Optional[str] = field( 68 | default=None, 69 | metadata={ 70 | "help": "The model checkpoint for weights initialization." 71 | "Don't set if you want to train a model from scratch." 72 | }, 73 | ) 74 | model_type: Optional[str] = field( 75 | default=None, 76 | metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, 77 | ) 78 | config_name: Optional[str] = field( 79 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 80 | ) 81 | tokenizer_name: Optional[str] = field( 82 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 83 | ) 84 | cache_dir: Optional[str] = field( 85 | default=None, 86 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 87 | ) 88 | use_fast_tokenizer: bool = field( 89 | default=True, 90 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 91 | ) 92 | model_revision: str = field( 93 | default="main", 94 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 95 | ) 96 | use_auth_token: bool = field( 97 | default=False, 98 | metadata={ 99 | "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " 100 | "with private models)." 101 | }, 102 | ) 103 | 104 | 105 | @dataclass 106 | class DataTrainingArguments: 107 | """ 108 | Arguments pertaining to what data we are going to input our model for training and eval. 
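The main CLM-specific option is `block_size`, which sets the chunk length the concatenated
corpus is split into; it defaults to the model's maximum input length (capped at 1024 in this script).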
109 | """ 110 | 111 | dataset_name: Optional[str] = field( 112 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 113 | ) 114 | dataset_config_name: Optional[str] = field( 115 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 116 | ) 117 | train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) 118 | validation_file: Optional[str] = field( 119 | default=None, 120 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, 121 | ) 122 | max_train_samples: Optional[int] = field( 123 | default=None, 124 | metadata={ 125 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this " 126 | "value if set." 127 | }, 128 | ) 129 | max_eval_samples: Optional[int] = field( 130 | default=None, 131 | metadata={ 132 | "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 133 | "value if set." 134 | }, 135 | ) 136 | 137 | block_size: Optional[int] = field( 138 | default=None, 139 | metadata={ 140 | "help": "Optional input sequence length after tokenization. " 141 | "The training dataset will be truncated in block of this size for training. " 142 | "Default to the model max input length for single sentence inputs (take into account special tokens)." 143 | }, 144 | ) 145 | overwrite_cache: bool = field( 146 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 147 | ) 148 | validation_split_percentage: Optional[int] = field( 149 | default=5, 150 | metadata={ 151 | "help": "The percentage of the train set used as validation set in case there's no validation split" 152 | }, 153 | ) 154 | preprocessing_num_workers: Optional[int] = field( 155 | default=None, 156 | metadata={"help": "The number of processes to use for the preprocessing."}, 157 | ) 158 | 159 | def __post_init__(self): 160 | if self.dataset_name is None and self.train_file is None and self.validation_file is None: 161 | raise ValueError("Need either a dataset name or a training/validation file.") 162 | else: 163 | if self.train_file is not None: 164 | extension = self.train_file.split(".")[-1] 165 | assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." 166 | if self.validation_file is not None: 167 | extension = self.validation_file.split(".")[-1] 168 | assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." 169 | 170 | 171 | def main(): 172 | # See all possible arguments in src/transformers/training_args.py 173 | # or by passing the --help flag to this script. 174 | # We now keep distinct sets of args, for a cleaner separation of concerns. 175 | 176 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 177 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 178 | # If we pass only one argument to the script and it's the path to a json file, 179 | # let's parse it to get our arguments. 180 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 181 | else: 182 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 183 | 184 | # Detecting last checkpoint. 
185 | last_checkpoint = None 186 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 187 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 188 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 189 | raise ValueError( 190 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 191 | "Use --overwrite_output_dir to overcome." 192 | ) 193 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 194 | logger.info( 195 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 196 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 197 | ) 198 | 199 | # Setup logging 200 | logging.basicConfig( 201 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 202 | datefmt="%m/%d/%Y %H:%M:%S", 203 | handlers=[logging.StreamHandler(sys.stdout)], 204 | ) 205 | logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) 206 | 207 | # Log on each process the small summary: 208 | logger.warning( 209 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 210 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 211 | ) 212 | # Set the verbosity to info of the Transformers logger (on main process only): 213 | if is_main_process(training_args.local_rank): 214 | transformers.utils.logging.set_verbosity_info() 215 | transformers.utils.logging.enable_default_handler() 216 | transformers.utils.logging.enable_explicit_format() 217 | logger.info(f"Training/evaluation parameters {training_args}") 218 | 219 | # Set seed before initializing model. 220 | set_seed(training_args.seed) 221 | 222 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 223 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 224 | # (the dataset will be downloaded automatically from the datasets Hub). 225 | # 226 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called 227 | # 'text' is found. You can easily tweak this behavior (see below). 228 | # 229 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 230 | # download the dataset. 231 | if data_args.dataset_name is not None: 232 | # Downloading and loading a dataset from the hub. 
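# If the dataset has no "validation" split, the first `validation_split_percentage` percent of
# the train split is used for evaluation and the remainder for training.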
233 | datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) 234 | if "validation" not in datasets.keys(): 235 | datasets["validation"] = load_dataset( 236 | data_args.dataset_name, 237 | data_args.dataset_config_name, 238 | split=f"train[:{data_args.validation_split_percentage}%]", 239 | cache_dir=model_args.cache_dir, 240 | ) 241 | datasets["train"] = load_dataset( 242 | data_args.dataset_name, 243 | data_args.dataset_config_name, 244 | split=f"train[{data_args.validation_split_percentage}%:]", 245 | cache_dir=model_args.cache_dir, 246 | ) 247 | else: 248 | data_files = {} 249 | if data_args.train_file is not None: 250 | data_files["train"] = data_args.train_file 251 | if data_args.validation_file is not None: 252 | data_files["validation"] = data_args.validation_file 253 | extension = ( 254 | data_args.train_file.split(".")[-1] 255 | if data_args.train_file is not None 256 | else data_args.validation_file.split(".")[-1] 257 | ) 258 | if extension == "txt": 259 | extension = "text" 260 | datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) 261 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 262 | # https://huggingface.co/docs/datasets/loading_datasets.html. 263 | 264 | # Load pretrained model and tokenizer 265 | # 266 | # Distributed training: 267 | # The .from_pretrained methods guarantee that only one local process can concurrently 268 | # download model & vocab. 269 | 270 | config_kwargs = { 271 | "cache_dir": model_args.cache_dir, 272 | "revision": model_args.model_revision, 273 | "use_auth_token": True if model_args.use_auth_token else None, 274 | } 275 | if model_args.config_name: 276 | config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) 277 | elif model_args.model_name_or_path: 278 | config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) 279 | else: 280 | config = CONFIG_MAPPING[model_args.model_type]() 281 | logger.warning("You are instantiating a new config instance from scratch.") 282 | 283 | tokenizer_kwargs = { 284 | "cache_dir": model_args.cache_dir, 285 | "use_fast": model_args.use_fast_tokenizer, 286 | "revision": model_args.model_revision, 287 | "use_auth_token": True if model_args.use_auth_token else None, 288 | } 289 | if model_args.tokenizer_name: 290 | tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) 291 | elif model_args.model_name_or_path: 292 | tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) 293 | else: 294 | raise ValueError( 295 | "You are instantiating a new tokenizer from scratch. This is not supported by this script." 296 | "You can do it from another script, save it, and load it from here, using --tokenizer_name." 297 | ) 298 | 299 | if model_args.model_name_or_path: 300 | model = AutoModelForCausalLM.from_pretrained( 301 | model_args.model_name_or_path, 302 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 303 | config=config, 304 | cache_dir=model_args.cache_dir, 305 | revision=model_args.model_revision, 306 | use_auth_token=True if model_args.use_auth_token else None, 307 | ) 308 | else: 309 | logger.info("Training new model from scratch") 310 | model = AutoModelForCausalLM.from_config(config) 311 | 312 | model.resize_token_embeddings(len(tokenizer)) 313 | 314 | # Preprocessing the datasets. 315 | # First we tokenize all the texts. 
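# The column named "text" is tokenized when it exists; otherwise the first column of the dataset is used.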
316 | if training_args.do_train: 317 | column_names = datasets["train"].column_names 318 | else: 319 | column_names = datasets["validation"].column_names 320 | text_column_name = "text" if "text" in column_names else column_names[0] 321 | 322 | # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function 323 | tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") 324 | 325 | def tokenize_function(examples): 326 | with CaptureLogger(tok_logger) as cl: 327 | output = tokenizer(examples[text_column_name]) 328 | # clm input could be much much longer than block_size 329 | if "Token indices sequence length is longer than the" in cl.out: 330 | tok_logger.warning( 331 | "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model." 332 | ) 333 | return output 334 | 335 | tokenized_datasets = datasets.map( 336 | tokenize_function, 337 | batched=True, 338 | num_proc=data_args.preprocessing_num_workers, 339 | remove_columns=column_names, 340 | load_from_cache_file=not data_args.overwrite_cache, 341 | ) 342 | 343 | if data_args.block_size is None: 344 | block_size = tokenizer.model_max_length 345 | if block_size > 1024: 346 | logger.warning( 347 | f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " 348 | "Picking 1024 instead. You can change that default value by passing --block_size xxx." 349 | ) 350 | block_size = 1024 351 | else: 352 | if data_args.block_size > tokenizer.model_max_length: 353 | logger.warning( 354 | f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" 355 | f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." 356 | ) 357 | block_size = min(data_args.block_size, tokenizer.model_max_length) 358 | 359 | # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. 360 | def group_texts(examples): 361 | # Concatenate all texts. 362 | concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} 363 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 364 | # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can 365 | # customize this part to your needs. 366 | total_length = (total_length // block_size) * block_size 367 | # Split by chunks of max_len. 368 | result = { 369 | k: [t[i : i + block_size] for i in range(0, total_length, block_size)] 370 | for k, t in concatenated_examples.items() 371 | } 372 | result["labels"] = result["input_ids"].copy() 373 | return result 374 | 375 | # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder 376 | # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower 377 | # to preprocess. 378 | # 379 | # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: 380 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map 381 | 382 | lm_datasets = tokenized_datasets.map( 383 | group_texts, 384 | batched=True, 385 | num_proc=data_args.preprocessing_num_workers, 386 | load_from_cache_file=not data_args.overwrite_cache, 387 | ) 388 | 389 | if training_args.do_train: 390 | if "train" not in tokenized_datasets: 391 | raise ValueError("--do_train requires a train dataset") 392 | train_dataset = lm_datasets["train"] 393 | if data_args.max_train_samples is not None: 394 | train_dataset = train_dataset.select(range(data_args.max_train_samples)) 395 | 396 | if training_args.do_eval: 397 | if "validation" not in tokenized_datasets: 398 | raise ValueError("--do_eval requires a validation dataset") 399 | eval_dataset = lm_datasets["validation"] 400 | if data_args.max_eval_samples is not None: 401 | eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) 402 | 403 | # Initialize our Trainer 404 | trainer = Trainer( 405 | model=model, 406 | args=training_args, 407 | train_dataset=train_dataset if training_args.do_train else None, 408 | eval_dataset=eval_dataset if training_args.do_eval else None, 409 | tokenizer=tokenizer, 410 | # Data collator will default to DataCollatorWithPadding, so we change it. 411 | data_collator=default_data_collator, 412 | ) 413 | 414 | # Training 415 | if training_args.do_train: 416 | checkpoint = None 417 | if training_args.resume_from_checkpoint is not None: 418 | checkpoint = training_args.resume_from_checkpoint 419 | elif last_checkpoint is not None: 420 | checkpoint = last_checkpoint 421 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 422 | trainer.save_model() # Saves the tokenizer too for easy upload 423 | 424 | metrics = train_result.metrics 425 | 426 | max_train_samples = ( 427 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 428 | ) 429 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 430 | 431 | trainer.log_metrics("train", metrics) 432 | trainer.save_metrics("train", metrics) 433 | trainer.save_state() 434 | 435 | # Evaluation 436 | if training_args.do_eval: 437 | logger.info("*** Evaluate ***") 438 | 439 | metrics = trainer.evaluate() 440 | 441 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 442 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 443 | perplexity = math.exp(metrics["eval_loss"]) 444 | metrics["perplexity"] = perplexity 445 | 446 | trainer.log_metrics("eval", metrics) 447 | trainer.save_metrics("eval", metrics) 448 | 449 | if training_args.push_to_hub: 450 | kwargs = {"finetuned_from": model_args.model_name_or_path, "tags": "text-generation"} 451 | if data_args.dataset_name is not None: 452 | kwargs["dataset_tags"] = data_args.dataset_name 453 | if data_args.dataset_config_name is not None: 454 | kwargs["dataset_args"] = data_args.dataset_config_name 455 | kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" 456 | else: 457 | kwargs["dataset"] = data_args.dataset_name 458 | 459 | trainer.push_to_hub(**kwargs) 460 | 461 | 462 | def _mp_fn(index): 463 | # For xla_spawn (TPUs) 464 | main() 465 | 466 | 467 | if __name__ == "__main__": 468 | main() 469 | -------------------------------------------------------------------------------- /run_clm_no_trainer.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2021 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) 18 | on a text file or a dataset without using HuggingFace Trainer. 19 | 20 | Here is the full list of checkpoints on the hub that can be fine-tuned by this script: 21 | https://huggingface.co/models?filter=causal-lm 22 | """ 23 | # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. 24 | 25 | import argparse 26 | import logging 27 | import math 28 | import os 29 | import random 30 | 31 | import datasets 32 | import torch 33 | from datasets import load_dataset 34 | from torch.utils.data.dataloader import DataLoader 35 | from tqdm.auto import tqdm 36 | 37 | import transformers 38 | from accelerate import Accelerator 39 | from transformers import ( 40 | CONFIG_MAPPING, 41 | MODEL_MAPPING, 42 | AdamW, 43 | AutoConfig, 44 | AutoModelForCausalLM, 45 | AutoTokenizer, 46 | SchedulerType, 47 | default_data_collator, 48 | get_scheduler, 49 | set_seed, 50 | ) 51 | 52 | 53 | logger = logging.getLogger(__name__) 54 | MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) 55 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 56 | 57 | 58 | def parse_args(): 59 | parser = argparse.ArgumentParser(description="Finetune a transformers model on a causal language modeling task") 60 | parser.add_argument( 61 | "--dataset_name", 62 | type=str, 63 | default=None, 64 | help="The name of the dataset to use (via the datasets library).", 65 | ) 66 | parser.add_argument( 67 | "--dataset_config_name", 68 | type=str, 69 | default=None, 70 | help="The configuration name of the dataset to use (via the datasets library).", 71 | ) 72 | parser.add_argument( 73 | "--train_file", type=str, default=None, help="A csv or a json file containing the training data." 74 | ) 75 | parser.add_argument( 76 | "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data." 
77 | ) 78 | parser.add_argument( 79 | "--validation_split_percentage", 80 | default=5, 81 | help="The percentage of the train set used as validation set in case there's no validation split", 82 | ) 83 | parser.add_argument( 84 | "--model_name_or_path", 85 | type=str, 86 | help="Path to pretrained model or model identifier from huggingface.co/models.", 87 | required=True, 88 | ) 89 | parser.add_argument( 90 | "--config_name", 91 | type=str, 92 | default=None, 93 | help="Pretrained config name or path if not the same as model_name", 94 | ) 95 | parser.add_argument( 96 | "--tokenizer_name", 97 | type=str, 98 | default=None, 99 | help="Pretrained tokenizer name or path if not the same as model_name", 100 | ) 101 | parser.add_argument( 102 | "--use_slow_tokenizer", 103 | action="store_true", 104 | help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).", 105 | ) 106 | parser.add_argument( 107 | "--per_device_train_batch_size", 108 | type=int, 109 | default=8, 110 | help="Batch size (per device) for the training dataloader.", 111 | ) 112 | parser.add_argument( 113 | "--per_device_eval_batch_size", 114 | type=int, 115 | default=8, 116 | help="Batch size (per device) for the evaluation dataloader.", 117 | ) 118 | parser.add_argument( 119 | "--learning_rate", 120 | type=float, 121 | default=5e-5, 122 | help="Initial learning rate (after the potential warmup period) to use.", 123 | ) 124 | parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.") 125 | parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.") 126 | parser.add_argument( 127 | "--max_train_steps", 128 | type=int, 129 | default=None, 130 | help="Total number of training steps to perform. If provided, overrides num_train_epochs.", 131 | ) 132 | parser.add_argument( 133 | "--gradient_accumulation_steps", 134 | type=int, 135 | default=1, 136 | help="Number of updates steps to accumulate before performing a backward/update pass.", 137 | ) 138 | parser.add_argument( 139 | "--lr_scheduler_type", 140 | type=SchedulerType, 141 | default="linear", 142 | help="The scheduler type to use.", 143 | choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"], 144 | ) 145 | parser.add_argument( 146 | "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler." 147 | ) 148 | parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.") 149 | parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") 150 | parser.add_argument( 151 | "--model_type", 152 | type=str, 153 | default=None, 154 | help="Model type to use if training from scratch.", 155 | choices=MODEL_TYPES, 156 | ) 157 | parser.add_argument( 158 | "--block_size", 159 | type=int, 160 | default=None, 161 | help="Optional input sequence length after tokenization. The training dataset will be truncated in block of this size for training. 
Default to the model max input length for single sentence inputs (take into account special tokens).", 162 | ) 163 | parser.add_argument( 164 | "--preprocessing_num_workers", 165 | type=int, 166 | default=None, 167 | help="The number of processes to use for the preprocessing.", 168 | ) 169 | parser.add_argument( 170 | "--overwrite_cache", type=bool, default=False, help="Overwrite the cached training and evaluation sets" 171 | ) 172 | 173 | args = parser.parse_args() 174 | 175 | # Sanity checks 176 | if args.dataset_name is None and args.train_file is None and args.validation_file is None: 177 | raise ValueError("Need either a dataset name or a training/validation file.") 178 | else: 179 | if args.train_file is not None: 180 | extension = args.train_file.split(".")[-1] 181 | assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file." 182 | if args.validation_file is not None: 183 | extension = args.validation_file.split(".")[-1] 184 | assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file." 185 | 186 | if args.output_dir is not None: 187 | os.makedirs(args.output_dir, exist_ok=True) 188 | 189 | return args 190 | 191 | 192 | def main(): 193 | args = parse_args() 194 | 195 | # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. 196 | accelerator = Accelerator() 197 | # Make one log on every process with the configuration for debugging. 198 | logging.basicConfig( 199 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 200 | datefmt="%m/%d/%Y %H:%M:%S", 201 | level=logging.INFO, 202 | ) 203 | logger.info(accelerator.state) 204 | 205 | # Setup logging, we only want one process per machine to log things on the screen. 206 | # accelerator.is_local_main_process is only True for one process per machine. 207 | logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR) 208 | if accelerator.is_local_main_process: 209 | datasets.utils.logging.set_verbosity_warning() 210 | transformers.utils.logging.set_verbosity_info() 211 | else: 212 | datasets.utils.logging.set_verbosity_error() 213 | transformers.utils.logging.set_verbosity_error() 214 | 215 | # If passed along, set the training seed now. 216 | if args.seed is not None: 217 | set_seed(args.seed) 218 | 219 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 220 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 221 | # (the dataset will be downloaded automatically from the datasets Hub). 222 | # 223 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called 224 | # 'text' is found. You can easily tweak this behavior (see below). 225 | # 226 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 227 | # download the dataset. 228 | if args.dataset_name is not None: 229 | # Downloading and loading a dataset from the hub. 
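# As in run_clm.py, a missing "validation" split is carved out of the first
# `validation_split_percentage` percent of the train split.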
230 | raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) 231 | if "validation" not in raw_datasets.keys(): 232 | raw_datasets["validation"] = load_dataset( 233 | args.dataset_name, 234 | args.dataset_config_name, 235 | split=f"train[:{args.validation_split_percentage}%]", 236 | ) 237 | raw_datasets["train"] = load_dataset( 238 | args.dataset_name, 239 | args.dataset_config_name, 240 | split=f"train[{args.validation_split_percentage}%:]", 241 | ) 242 | else: 243 | data_files = {} 244 | if args.train_file is not None: 245 | data_files["train"] = args.train_file 246 | if args.validation_file is not None: 247 | data_files["validation"] = args.validation_file 248 | extension = args.train_file.split(".")[-1] 249 | if extension == "txt": 250 | extension = "text" 251 | raw_datasets = load_dataset(extension, data_files=data_files) 252 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 253 | # https://huggingface.co/docs/datasets/loading_datasets.html. 254 | 255 | # Load pretrained model and tokenizer 256 | # 257 | # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently 258 | # download model & vocab. 259 | if args.config_name: 260 | config = AutoConfig.from_pretrained(args.config_name) 261 | elif args.model_name_or_path: 262 | config = AutoConfig.from_pretrained(args.model_name_or_path) 263 | else: 264 | config = CONFIG_MAPPING[args.model_type]() 265 | logger.warning("You are instantiating a new config instance from scratch.") 266 | 267 | if args.tokenizer_name: 268 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer) 269 | elif args.model_name_or_path: 270 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer) 271 | else: 272 | raise ValueError( 273 | "You are instantiating a new tokenizer from scratch. This is not supported by this script." 274 | "You can do it from another script, save it, and load it from here, using --tokenizer_name." 275 | ) 276 | 277 | if args.model_name_or_path: 278 | model = AutoModelForCausalLM.from_pretrained( 279 | args.model_name_or_path, 280 | from_tf=bool(".ckpt" in args.model_name_or_path), 281 | config=config, 282 | ) 283 | else: 284 | logger.info("Training new model from scratch") 285 | model = AutoModelForCausalLM.from_config(config) 286 | 287 | model.resize_token_embeddings(len(tokenizer)) 288 | 289 | # Preprocessing the datasets. 290 | # First we tokenize all the texts. 291 | column_names = raw_datasets["train"].column_names 292 | text_column_name = "text" if "text" in column_names else column_names[0] 293 | 294 | def tokenize_function(examples): 295 | return tokenizer(examples[text_column_name]) 296 | 297 | tokenized_datasets = raw_datasets.map( 298 | tokenize_function, 299 | batched=True, 300 | num_proc=args.preprocessing_num_workers, 301 | remove_columns=column_names, 302 | load_from_cache_file=not args.overwrite_cache, 303 | ) 304 | 305 | if args.block_size is None: 306 | block_size = tokenizer.model_max_length 307 | if block_size > 1024: 308 | logger.warning( 309 | f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " 310 | "Picking 1024 instead. You can change that default value by passing --block_size xxx." 
311 | ) 312 | block_size = 1024 313 | else: 314 | if args.block_size > tokenizer.model_max_length: 315 | logger.warning( 316 | f"The block_size passed ({args.block_size}) is larger than the maximum length for the model" 317 | f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." 318 | ) 319 | block_size = min(args.block_size, tokenizer.model_max_length) 320 | 321 | # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. 322 | def group_texts(examples): 323 | # Concatenate all texts. 324 | concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} 325 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 326 | # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can 327 | # customize this part to your needs. 328 | total_length = (total_length // block_size) * block_size 329 | # Split by chunks of max_len. 330 | result = { 331 | k: [t[i : i + block_size] for i in range(0, total_length, block_size)] 332 | for k, t in concatenated_examples.items() 333 | } 334 | result["labels"] = result["input_ids"].copy() 335 | return result 336 | 337 | # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder 338 | # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower 339 | # to preprocess. 340 | # 341 | # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: 342 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map 343 | 344 | lm_datasets = tokenized_datasets.map( 345 | group_texts, 346 | batched=True, 347 | num_proc=args.preprocessing_num_workers, 348 | load_from_cache_file=not args.overwrite_cache, 349 | ) 350 | 351 | train_dataset = lm_datasets["train"] 352 | eval_dataset = lm_datasets["validation"] 353 | 354 | # Log a few random samples from the training set: 355 | for index in random.sample(range(len(train_dataset)), 3): 356 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") 357 | 358 | # DataLoaders creation: 359 | train_dataloader = DataLoader( 360 | train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=args.per_device_train_batch_size 361 | ) 362 | eval_dataloader = DataLoader( 363 | eval_dataset, collate_fn=default_data_collator, batch_size=args.per_device_eval_batch_size 364 | ) 365 | 366 | # Optimizer 367 | # Split weights in two groups, one with weight decay and the other not. 368 | no_decay = ["bias", "LayerNorm.weight"] 369 | optimizer_grouped_parameters = [ 370 | { 371 | "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 372 | "weight_decay": args.weight_decay, 373 | }, 374 | { 375 | "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 376 | "weight_decay": 0.0, 377 | }, 378 | ] 379 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) 380 | 381 | # Prepare everything with our `accelerator`. 382 | model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( 383 | model, optimizer, train_dataloader, eval_dataloader 384 | ) 385 | 386 | # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be 387 | # shorter in multiprocess) 388 | 389 | # Scheduler and math around the number of training steps. 
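# One optimizer update happens every `gradient_accumulation_steps` batches, so
# updates per epoch = ceil(len(train_dataloader) / gradient_accumulation_steps);
# `--max_train_steps`, when given, takes precedence and the epoch count is derived from it.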
390 | num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) 391 | if args.max_train_steps is None: 392 | args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch 393 | else: 394 | args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) 395 | 396 | lr_scheduler = get_scheduler( 397 | name=args.lr_scheduler_type, 398 | optimizer=optimizer, 399 | num_warmup_steps=args.num_warmup_steps, 400 | num_training_steps=args.max_train_steps, 401 | ) 402 | 403 | # Train! 404 | total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps 405 | 406 | logger.info("***** Running training *****") 407 | logger.info(f" Num examples = {len(train_dataset)}") 408 | logger.info(f" Num Epochs = {args.num_train_epochs}") 409 | logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") 410 | logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") 411 | logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") 412 | logger.info(f" Total optimization steps = {args.max_train_steps}") 413 | # Only show the progress bar once on each machine. 414 | progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) 415 | completed_steps = 0 416 | 417 | for epoch in range(args.num_train_epochs): 418 | model.train() 419 | for step, batch in enumerate(train_dataloader): 420 | outputs = model(**batch) 421 | loss = outputs.loss 422 | loss = loss / args.gradient_accumulation_steps 423 | accelerator.backward(loss) 424 | if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: 425 | optimizer.step() 426 | lr_scheduler.step() 427 | optimizer.zero_grad() 428 | progress_bar.update(1) 429 | completed_steps += 1 430 | 431 | if completed_steps >= args.max_train_steps: 432 | break 433 | 434 | model.eval() 435 | losses = [] 436 | for step, batch in enumerate(eval_dataloader): 437 | with torch.no_grad(): 438 | outputs = model(**batch) 439 | 440 | loss = outputs.loss 441 | losses.append(accelerator.gather(loss.repeat(args.per_device_eval_batch_size))) 442 | 443 | losses = torch.cat(losses) 444 | losses = losses[: len(eval_dataset)] 445 | perplexity = math.exp(torch.mean(losses)) 446 | 447 | logger.info(f"epoch {epoch}: perplexity: {perplexity}") 448 | 449 | if args.output_dir is not None: 450 | accelerator.wait_for_everyone() 451 | unwrapped_model = accelerator.unwrap_model(model) 452 | unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save) 453 | 454 | 455 | if __name__ == "__main__": 456 | main() 457 | -------------------------------------------------------------------------------- /run_mlm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Team All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) on a text file or a dataset. 18 | 19 | Here is the full list of checkpoints on the hub that can be fine-tuned by this script: 20 | https://huggingface.co/models?filter=masked-lm 21 | """ 22 | # You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments. 23 | 24 | import logging 25 | import math 26 | import os 27 | import sys 28 | from dataclasses import dataclass, field 29 | from typing import Optional 30 | 31 | from datasets import load_dataset 32 | 33 | import transformers 34 | from transformers import ( 35 | CONFIG_MAPPING, 36 | MODEL_FOR_MASKED_LM_MAPPING, 37 | AutoConfig, 38 | AutoModelForMaskedLM, 39 | AutoTokenizer, 40 | DataCollatorForLanguageModeling, 41 | HfArgumentParser, 42 | Trainer, 43 | TrainingArguments, 44 | set_seed, 45 | ) 46 | from transformers.trainer_utils import get_last_checkpoint, is_main_process 47 | from transformers.utils import check_min_version 48 | 49 | 50 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 51 | check_min_version("4.7.0.dev0") 52 | 53 | logger = logging.getLogger(__name__) 54 | MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys()) 55 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 56 | 57 | 58 | @dataclass 59 | class ModelArguments: 60 | """ 61 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 62 | """ 63 | 64 | model_name_or_path: Optional[str] = field( 65 | default=None, 66 | metadata={ 67 | "help": "The model checkpoint for weights initialization." 68 | "Don't set if you want to train a model from scratch." 69 | }, 70 | ) 71 | model_type: Optional[str] = field( 72 | default=None, 73 | metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, 74 | ) 75 | config_name: Optional[str] = field( 76 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 77 | ) 78 | tokenizer_name: Optional[str] = field( 79 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 80 | ) 81 | cache_dir: Optional[str] = field( 82 | default=None, 83 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 84 | ) 85 | use_fast_tokenizer: bool = field( 86 | default=True, 87 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 88 | ) 89 | model_revision: str = field( 90 | default="main", 91 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 92 | ) 93 | use_auth_token: bool = field( 94 | default=False, 95 | metadata={ 96 | "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " 97 | "with private models)." 98 | }, 99 | ) 100 | 101 | 102 | @dataclass 103 | class DataTrainingArguments: 104 | """ 105 | Arguments pertaining to what data we are going to input our model for training and eval. 
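MLM-specific options include `mlm_probability` (the masking ratio), `line_by_line` (treat each
line as a separate sequence), and `max_seq_length`/`pad_to_max_length` (truncation and padding behaviour).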
106 | """ 107 | 108 | dataset_name: Optional[str] = field( 109 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 110 | ) 111 | dataset_config_name: Optional[str] = field( 112 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 113 | ) 114 | train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) 115 | validation_file: Optional[str] = field( 116 | default=None, 117 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, 118 | ) 119 | overwrite_cache: bool = field( 120 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 121 | ) 122 | validation_split_percentage: Optional[int] = field( 123 | default=5, 124 | metadata={ 125 | "help": "The percentage of the train set used as validation set in case there's no validation split" 126 | }, 127 | ) 128 | max_seq_length: Optional[int] = field( 129 | default=None, 130 | metadata={ 131 | "help": "The maximum total input sequence length after tokenization. Sequences longer " 132 | "than this will be truncated." 133 | }, 134 | ) 135 | preprocessing_num_workers: Optional[int] = field( 136 | default=None, 137 | metadata={"help": "The number of processes to use for the preprocessing."}, 138 | ) 139 | mlm_probability: float = field( 140 | default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"} 141 | ) 142 | line_by_line: bool = field( 143 | default=False, 144 | metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."}, 145 | ) 146 | pad_to_max_length: bool = field( 147 | default=False, 148 | metadata={ 149 | "help": "Whether to pad all samples to `max_seq_length`. " 150 | "If False, will pad the samples dynamically when batching to the maximum length in the batch." 151 | }, 152 | ) 153 | max_train_samples: Optional[int] = field( 154 | default=None, 155 | metadata={ 156 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this " 157 | "value if set." 158 | }, 159 | ) 160 | max_eval_samples: Optional[int] = field( 161 | default=None, 162 | metadata={ 163 | "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 164 | "value if set." 165 | }, 166 | ) 167 | 168 | def __post_init__(self): 169 | if self.dataset_name is None and self.train_file is None and self.validation_file is None: 170 | raise ValueError("Need either a dataset name or a training/validation file.") 171 | else: 172 | if self.train_file is not None: 173 | extension = self.train_file.split(".")[-1] 174 | assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." 175 | if self.validation_file is not None: 176 | extension = self.validation_file.split(".")[-1] 177 | assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." 178 | 179 | 180 | def main(): 181 | # See all possible arguments in src/transformers/training_args.py 182 | # or by passing the --help flag to this script. 183 | # We now keep distinct sets of args, for a cleaner separation of concerns. 
184 | 185 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 186 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 187 | # If we pass only one argument to the script and it's the path to a json file, 188 | # let's parse it to get our arguments. 189 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 190 | else: 191 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 192 | 193 | # Detecting last checkpoint. 194 | last_checkpoint = None 195 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 196 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 197 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 198 | raise ValueError( 199 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 200 | "Use --overwrite_output_dir to overcome." 201 | ) 202 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 203 | logger.info( 204 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 205 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 206 | ) 207 | 208 | # Setup logging 209 | logging.basicConfig( 210 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 211 | datefmt="%m/%d/%Y %H:%M:%S", 212 | handlers=[logging.StreamHandler(sys.stdout)], 213 | ) 214 | logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) 215 | 216 | # Log on each process the small summary: 217 | logger.warning( 218 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 219 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 220 | ) 221 | # Set the verbosity to info of the Transformers logger (on main process only): 222 | if is_main_process(training_args.local_rank): 223 | transformers.utils.logging.set_verbosity_info() 224 | transformers.utils.logging.enable_default_handler() 225 | transformers.utils.logging.enable_explicit_format() 226 | logger.info(f"Training/evaluation parameters {training_args}") 227 | 228 | # Set seed before initializing model. 229 | set_seed(training_args.seed) 230 | 231 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 232 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 233 | # (the dataset will be downloaded automatically from the datasets Hub 234 | # 235 | # For CSV/JSON files, this script will use the column called 'text' or the first column. You can easily tweak this 236 | # behavior (see below) 237 | # 238 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 239 | # download the dataset. 240 | if data_args.dataset_name is not None: 241 | # Downloading and loading a dataset from the hub. 
242 | datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) 243 | if "validation" not in datasets.keys(): 244 | datasets["validation"] = load_dataset( 245 | data_args.dataset_name, 246 | data_args.dataset_config_name, 247 | split=f"train[:{data_args.validation_split_percentage}%]", 248 | cache_dir=model_args.cache_dir, 249 | ) 250 | datasets["train"] = load_dataset( 251 | data_args.dataset_name, 252 | data_args.dataset_config_name, 253 | split=f"train[{data_args.validation_split_percentage}%:]", 254 | cache_dir=model_args.cache_dir, 255 | ) 256 | else: 257 | data_files = {} 258 | if data_args.train_file is not None: 259 | data_files["train"] = data_args.train_file 260 | if data_args.validation_file is not None: 261 | data_files["validation"] = data_args.validation_file 262 | extension = data_args.train_file.split(".")[-1] 263 | if extension == "txt": 264 | extension = "text" 265 | datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) 266 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 267 | # https://huggingface.co/docs/datasets/loading_datasets.html. 268 | 269 | # Load pretrained model and tokenizer 270 | # 271 | # Distributed training: 272 | # The .from_pretrained methods guarantee that only one local process can concurrently 273 | # download model & vocab. 274 | config_kwargs = { 275 | "cache_dir": model_args.cache_dir, 276 | "revision": model_args.model_revision, 277 | "use_auth_token": True if model_args.use_auth_token else None, 278 | } 279 | if model_args.config_name: 280 | config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) 281 | elif model_args.model_name_or_path: 282 | config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) 283 | else: 284 | config = CONFIG_MAPPING[model_args.model_type]() 285 | logger.warning("You are instantiating a new config instance from scratch.") 286 | 287 | tokenizer_kwargs = { 288 | "cache_dir": model_args.cache_dir, 289 | "use_fast": model_args.use_fast_tokenizer, 290 | "revision": model_args.model_revision, 291 | "use_auth_token": True if model_args.use_auth_token else None, 292 | } 293 | if model_args.tokenizer_name: 294 | tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) 295 | elif model_args.model_name_or_path: 296 | tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) 297 | else: 298 | raise ValueError( 299 | "You are instantiating a new tokenizer from scratch. This is not supported by this script." 300 | "You can do it from another script, save it, and load it from here, using --tokenizer_name." 301 | ) 302 | 303 | if model_args.model_name_or_path: 304 | model = AutoModelForMaskedLM.from_pretrained( 305 | model_args.model_name_or_path, 306 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 307 | config=config, 308 | cache_dir=model_args.cache_dir, 309 | revision=model_args.model_revision, 310 | use_auth_token=True if model_args.use_auth_token else None, 311 | ) 312 | else: 313 | logger.info("Training new model from scratch") 314 | model = AutoModelForMaskedLM.from_config(config) 315 | 316 | model.resize_token_embeddings(len(tokenizer)) 317 | 318 | # Preprocessing the datasets. 319 | # First we tokenize all the texts. 
320 | if training_args.do_train: 321 | column_names = datasets["train"].column_names 322 | else: 323 | column_names = datasets["validation"].column_names 324 | text_column_name = "text" if "text" in column_names else column_names[0] 325 | 326 | if data_args.max_seq_length is None: 327 | max_seq_length = tokenizer.model_max_length 328 | if max_seq_length > 1024: 329 | logger.warning( 330 | f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " 331 | "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." 332 | ) 333 | max_seq_length = 1024 334 | else: 335 | if data_args.max_seq_length > tokenizer.model_max_length: 336 | logger.warning( 337 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" 338 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 339 | ) 340 | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) 341 | 342 | if data_args.line_by_line: 343 | # When using line_by_line, we just tokenize each nonempty line. 344 | padding = "max_length" if data_args.pad_to_max_length else False 345 | 346 | def tokenize_function(examples): 347 | # Remove empty lines 348 | examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()] 349 | return tokenizer( 350 | examples["text"], 351 | padding=padding, 352 | truncation=True, 353 | max_length=max_seq_length, 354 | # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it 355 | # receives the `special_tokens_mask`. 356 | return_special_tokens_mask=True, 357 | ) 358 | 359 | tokenized_datasets = datasets.map( 360 | tokenize_function, 361 | batched=True, 362 | num_proc=data_args.preprocessing_num_workers, 363 | remove_columns=[text_column_name], 364 | load_from_cache_file=not data_args.overwrite_cache, 365 | ) 366 | else: 367 | # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. 368 | # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more 369 | # efficient when it receives the `special_tokens_mask`. 370 | def tokenize_function(examples): 371 | return tokenizer(examples[text_column_name], return_special_tokens_mask=True) 372 | 373 | tokenized_datasets = datasets.map( 374 | tokenize_function, 375 | batched=True, 376 | num_proc=data_args.preprocessing_num_workers, 377 | remove_columns=column_names, 378 | load_from_cache_file=not data_args.overwrite_cache, 379 | ) 380 | 381 | # Main data processing function that will concatenate all texts from our dataset and generate chunks of 382 | # max_seq_length. 383 | def group_texts(examples): 384 | # Concatenate all texts. 385 | concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} 386 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 387 | # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can 388 | # customize this part to your needs. 389 | total_length = (total_length // max_seq_length) * max_seq_length 390 | # Split by chunks of max_len. 
391 | result = { 392 | k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)] 393 | for k, t in concatenated_examples.items() 394 | } 395 | return result 396 | 397 | # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a 398 | # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value 399 | # might be slower to preprocess. 400 | # 401 | # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: 402 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map 403 | 404 | tokenized_datasets = tokenized_datasets.map( 405 | group_texts, 406 | batched=True, 407 | num_proc=data_args.preprocessing_num_workers, 408 | load_from_cache_file=not data_args.overwrite_cache, 409 | ) 410 | 411 | if training_args.do_train: 412 | if "train" not in tokenized_datasets: 413 | raise ValueError("--do_train requires a train dataset") 414 | train_dataset = tokenized_datasets["train"] 415 | if data_args.max_train_samples is not None: 416 | train_dataset = train_dataset.select(range(data_args.max_train_samples)) 417 | 418 | if training_args.do_eval: 419 | if "validation" not in tokenized_datasets: 420 | raise ValueError("--do_eval requires a validation dataset") 421 | eval_dataset = tokenized_datasets["validation"] 422 | if data_args.max_eval_samples is not None: 423 | eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) 424 | 425 | # Data collator 426 | # This one will take care of randomly masking the tokens. 427 | pad_to_multiple_of_8 = data_args.line_by_line and training_args.fp16 and not data_args.pad_to_max_length 428 | data_collator = DataCollatorForLanguageModeling( 429 | tokenizer=tokenizer, 430 | mlm_probability=data_args.mlm_probability, 431 | pad_to_multiple_of=8 if pad_to_multiple_of_8 else None, 432 | ) 433 | 434 | # Initialize our Trainer 435 | trainer = Trainer( 436 | model=model, 437 | args=training_args, 438 | train_dataset=train_dataset if training_args.do_train else None, 439 | eval_dataset=eval_dataset if training_args.do_eval else None, 440 | tokenizer=tokenizer, 441 | data_collator=data_collator, 442 | ) 443 | 444 | # Training 445 | if training_args.do_train: 446 | checkpoint = None 447 | if training_args.resume_from_checkpoint is not None: 448 | checkpoint = training_args.resume_from_checkpoint 449 | elif last_checkpoint is not None: 450 | checkpoint = last_checkpoint 451 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 452 | trainer.save_model() # Saves the tokenizer too for easy upload 453 | metrics = train_result.metrics 454 | 455 | max_train_samples = ( 456 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 457 | ) 458 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 459 | 460 | trainer.log_metrics("train", metrics) 461 | trainer.save_metrics("train", metrics) 462 | trainer.save_state() 463 | 464 | # Evaluation 465 | if training_args.do_eval: 466 | logger.info("*** Evaluate ***") 467 | 468 | metrics = trainer.evaluate() 469 | 470 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 471 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 472 | perplexity = math.exp(metrics["eval_loss"]) 473 | metrics["perplexity"] = perplexity 474 | 475 | trainer.log_metrics("eval", metrics) 476 | 
trainer.save_metrics("eval", metrics) 477 | 478 | if training_args.push_to_hub: 479 | kwargs = {"finetuned_from": model_args.model_name_or_path, "tags": "fill-mask"} 480 | if data_args.dataset_name is not None: 481 | kwargs["dataset_tags"] = data_args.dataset_name 482 | if data_args.dataset_config_name is not None: 483 | kwargs["dataset_args"] = data_args.dataset_config_name 484 | kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" 485 | else: 486 | kwargs["dataset"] = data_args.dataset_name 487 | 488 | trainer.push_to_hub(**kwargs) 489 | 490 | 491 | def _mp_fn(index): 492 | # For xla_spawn (TPUs) 493 | main() 494 | 495 | 496 | if __name__ == "__main__": 497 | main() 498 | -------------------------------------------------------------------------------- /run_mlm_no_trainer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2021 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) 18 | on a text file or a dataset without using HuggingFace Trainer. 19 | 20 | Here is the full list of checkpoints on the hub that can be fine-tuned by this script: 21 | https://huggingface.co/models?filter=masked-lm 22 | """ 23 | # You can also adapt this script on your own mlm task. Pointers for this are left as comments. 24 | 25 | import argparse 26 | import logging 27 | import math 28 | import os 29 | import random 30 | 31 | import datasets 32 | import torch 33 | from datasets import load_dataset 34 | from torch.utils.data.dataloader import DataLoader 35 | from tqdm.auto import tqdm 36 | 37 | import transformers 38 | from accelerate import Accelerator 39 | from transformers import ( 40 | CONFIG_MAPPING, 41 | MODEL_MAPPING, 42 | AdamW, 43 | AutoConfig, 44 | AutoModelForMaskedLM, 45 | AutoTokenizer, 46 | DataCollatorForLanguageModeling, 47 | SchedulerType, 48 | get_scheduler, 49 | set_seed, 50 | ) 51 | 52 | 53 | logger = logging.getLogger(__name__) 54 | MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) 55 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 56 | 57 | 58 | def parse_args(): 59 | parser = argparse.ArgumentParser(description="Finetune a transformers model on a Masked Language Modeling task") 60 | parser.add_argument( 61 | "--dataset_name", 62 | type=str, 63 | default=None, 64 | help="The name of the dataset to use (via the datasets library).", 65 | ) 66 | parser.add_argument( 67 | "--dataset_config_name", 68 | type=str, 69 | default=None, 70 | help="The configuration name of the dataset to use (via the datasets library).", 71 | ) 72 | parser.add_argument( 73 | "--train_file", type=str, default=None, help="A csv or a json file containing the training data." 
74 | ) 75 | parser.add_argument( 76 | "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data." 77 | ) 78 | parser.add_argument( 79 | "--validation_split_percentage", 80 | default=5, 81 | help="The percentage of the train set used as validation set in case there's no validation split", 82 | ) 83 | parser.add_argument( 84 | "--pad_to_max_length", 85 | action="store_true", 86 | help="If passed, pad all samples to `max_length`. Otherwise, dynamic padding is used.", 87 | ) 88 | parser.add_argument( 89 | "--model_name_or_path", 90 | type=str, 91 | help="Path to pretrained model or model identifier from huggingface.co/models.", 92 | required=True, 93 | ) 94 | parser.add_argument( 95 | "--config_name", 96 | type=str, 97 | default=None, 98 | help="Pretrained config name or path if not the same as model_name", 99 | ) 100 | parser.add_argument( 101 | "--tokenizer_name", 102 | type=str, 103 | default=None, 104 | help="Pretrained tokenizer name or path if not the same as model_name", 105 | ) 106 | parser.add_argument( 107 | "--use_slow_tokenizer", 108 | action="store_true", 109 | help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).", 110 | ) 111 | parser.add_argument( 112 | "--per_device_train_batch_size", 113 | type=int, 114 | default=8, 115 | help="Batch size (per device) for the training dataloader.", 116 | ) 117 | parser.add_argument( 118 | "--per_device_eval_batch_size", 119 | type=int, 120 | default=8, 121 | help="Batch size (per device) for the evaluation dataloader.", 122 | ) 123 | parser.add_argument( 124 | "--learning_rate", 125 | type=float, 126 | default=5e-5, 127 | help="Initial learning rate (after the potential warmup period) to use.", 128 | ) 129 | parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.") 130 | parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.") 131 | parser.add_argument( 132 | "--max_train_steps", 133 | type=int, 134 | default=None, 135 | help="Total number of training steps to perform. If provided, overrides num_train_epochs.", 136 | ) 137 | parser.add_argument( 138 | "--gradient_accumulation_steps", 139 | type=int, 140 | default=1, 141 | help="Number of updates steps to accumulate before performing a backward/update pass.", 142 | ) 143 | parser.add_argument( 144 | "--lr_scheduler_type", 145 | type=SchedulerType, 146 | default="linear", 147 | help="The scheduler type to use.", 148 | choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"], 149 | ) 150 | parser.add_argument( 151 | "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler." 152 | ) 153 | parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.") 154 | parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") 155 | parser.add_argument( 156 | "--model_type", 157 | type=str, 158 | default=None, 159 | help="Model type to use if training from scratch.", 160 | choices=MODEL_TYPES, 161 | ) 162 | parser.add_argument( 163 | "--max_seq_length", 164 | type=int, 165 | default=None, 166 | help="The maximum total input sequence length after tokenization. 
Sequences longer than this will be truncated.", 167 | ) 168 | parser.add_argument( 169 | "--line_by_line", 170 | type=bool, 171 | default=False, 172 | help="Whether distinct lines of text in the dataset are to be handled as distinct sequences.", 173 | ) 174 | parser.add_argument( 175 | "--preprocessing_num_workers", 176 | type=int, 177 | default=None, 178 | help="The number of processes to use for the preprocessing.", 179 | ) 180 | parser.add_argument( 181 | "--overwrite_cache", type=bool, default=False, help="Overwrite the cached training and evaluation sets" 182 | ) 183 | parser.add_argument( 184 | "--mlm_probability", type=float, default=0.15, help="Ratio of tokens to mask for masked language modeling loss" 185 | ) 186 | 187 | args = parser.parse_args() 188 | 189 | # Sanity checks 190 | if args.dataset_name is None and args.train_file is None and args.validation_file is None: 191 | raise ValueError("Need either a dataset name or a training/validation file.") 192 | else: 193 | if args.train_file is not None: 194 | extension = args.train_file.split(".")[-1] 195 | assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file." 196 | if args.validation_file is not None: 197 | extension = args.validation_file.split(".")[-1] 198 | assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file." 199 | 200 | if args.output_dir is not None: 201 | os.makedirs(args.output_dir, exist_ok=True) 202 | 203 | return args 204 | 205 | 206 | def main(): 207 | args = parse_args() 208 | 209 | # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. 210 | accelerator = Accelerator() 211 | # Make one log on every process with the configuration for debugging. 212 | logging.basicConfig( 213 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 214 | datefmt="%m/%d/%Y %H:%M:%S", 215 | level=logging.INFO, 216 | ) 217 | logger.info(accelerator.state) 218 | 219 | # Setup logging, we only want one process per machine to log things on the screen. 220 | # accelerator.is_local_main_process is only True for one process per machine. 221 | logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR) 222 | if accelerator.is_local_main_process: 223 | datasets.utils.logging.set_verbosity_warning() 224 | transformers.utils.logging.set_verbosity_info() 225 | else: 226 | datasets.utils.logging.set_verbosity_error() 227 | transformers.utils.logging.set_verbosity_error() 228 | 229 | # If passed along, set the training seed now. 230 | if args.seed is not None: 231 | set_seed(args.seed) 232 | 233 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 234 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 235 | # (the dataset will be downloaded automatically from the datasets Hub). 236 | # 237 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called 238 | # 'text' is found. You can easily tweak this behavior (see below). 239 | # 240 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 241 | # download the dataset. 242 | if args.dataset_name is not None: 243 | # Downloading and loading a dataset from the hub. 
244 | raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) 245 | if "validation" not in raw_datasets.keys(): 246 | raw_datasets["validation"] = load_dataset( 247 | args.dataset_name, 248 | args.dataset_config_name, 249 | split=f"train[:{args.validation_split_percentage}%]", 250 | ) 251 | raw_datasets["train"] = load_dataset( 252 | args.dataset_name, 253 | args.dataset_config_name, 254 | split=f"train[{args.validation_split_percentage}%:]", 255 | ) 256 | else: 257 | data_files = {} 258 | if args.train_file is not None: 259 | data_files["train"] = args.train_file 260 | if args.validation_file is not None: 261 | data_files["validation"] = args.validation_file 262 | extension = args.train_file.split(".")[-1] 263 | if extension == "txt": 264 | extension = "text" 265 | raw_datasets = load_dataset(extension, data_files=data_files) 266 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 267 | # https://huggingface.co/docs/datasets/loading_datasets.html. 268 | 269 | # Load pretrained model and tokenizer 270 | # 271 | # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently 272 | # download model & vocab. 273 | if args.config_name: 274 | config = AutoConfig.from_pretrained(args.config_name) 275 | elif args.model_name_or_path: 276 | config = AutoConfig.from_pretrained(args.model_name_or_path) 277 | else: 278 | config = CONFIG_MAPPING[args.model_type]() 279 | logger.warning("You are instantiating a new config instance from scratch.") 280 | 281 | if args.tokenizer_name: 282 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer) 283 | elif args.model_name_or_path: 284 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer) 285 | else: 286 | raise ValueError( 287 | "You are instantiating a new tokenizer from scratch. This is not supported by this script." 288 | "You can do it from another script, save it, and load it from here, using --tokenizer_name." 289 | ) 290 | 291 | if args.model_name_or_path: 292 | model = AutoModelForMaskedLM.from_pretrained( 293 | args.model_name_or_path, 294 | from_tf=bool(".ckpt" in args.model_name_or_path), 295 | config=config, 296 | ) 297 | else: 298 | logger.info("Training new model from scratch") 299 | model = AutoModelForMaskedLM.from_config(config) 300 | 301 | model.resize_token_embeddings(len(tokenizer)) 302 | 303 | # Preprocessing the datasets. 304 | # First we tokenize all the texts. 305 | column_names = raw_datasets["train"].column_names 306 | text_column_name = "text" if "text" in column_names else column_names[0] 307 | 308 | if args.max_seq_length is None: 309 | max_seq_length = tokenizer.model_max_length 310 | if max_seq_length > 1024: 311 | logger.warning( 312 | f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " 313 | "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." 314 | ) 315 | max_seq_length = 1024 316 | else: 317 | if args.max_seq_length > tokenizer.model_max_length: 318 | logger.warning( 319 | f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the" 320 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 
321 | ) 322 | max_seq_length = min(args.max_seq_length, tokenizer.model_max_length) 323 | 324 | if args.line_by_line: 325 | # When using line_by_line, we just tokenize each nonempty line. 326 | padding = "max_length" if args.pad_to_max_length else False 327 | 328 | def tokenize_function(examples): 329 | # Remove empty lines 330 | examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()] 331 | return tokenizer( 332 | examples["text"], 333 | padding=padding, 334 | truncation=True, 335 | max_length=max_seq_length, 336 | # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it 337 | # receives the `special_tokens_mask`. 338 | return_special_tokens_mask=True, 339 | ) 340 | 341 | tokenized_datasets = raw_datasets.map( 342 | tokenize_function, 343 | batched=True, 344 | num_proc=args.preprocessing_num_workers, 345 | remove_columns=[text_column_name], 346 | load_from_cache_file=not args.overwrite_cache, 347 | ) 348 | else: 349 | # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. 350 | # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more 351 | # efficient when it receives the `special_tokens_mask`. 352 | def tokenize_function(examples): 353 | return tokenizer(examples[text_column_name], return_special_tokens_mask=True) 354 | 355 | tokenized_datasets = raw_datasets.map( 356 | tokenize_function, 357 | batched=True, 358 | num_proc=args.preprocessing_num_workers, 359 | remove_columns=column_names, 360 | load_from_cache_file=not args.overwrite_cache, 361 | ) 362 | 363 | # Main data processing function that will concatenate all texts from our dataset and generate chunks of 364 | # max_seq_length. 365 | def group_texts(examples): 366 | # Concatenate all texts. 367 | concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} 368 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 369 | # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can 370 | # customize this part to your needs. 371 | total_length = (total_length // max_seq_length) * max_seq_length 372 | # Split by chunks of max_len. 373 | result = { 374 | k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)] 375 | for k, t in concatenated_examples.items() 376 | } 377 | return result 378 | 379 | # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a 380 | # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value 381 | # might be slower to preprocess. 382 | # 383 | # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: 384 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map 385 | 386 | tokenized_datasets = tokenized_datasets.map( 387 | group_texts, 388 | batched=True, 389 | num_proc=args.preprocessing_num_workers, 390 | load_from_cache_file=not args.overwrite_cache, 391 | ) 392 | 393 | train_dataset = tokenized_datasets["train"] 394 | eval_dataset = tokenized_datasets["validation"] 395 | 396 | # Log a few random samples from the training set: 397 | for index in random.sample(range(len(train_dataset)), 3): 398 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") 399 | 400 | # Data collator 401 | # This one will take care of randomly masking the tokens. 402 | data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=args.mlm_probability) 403 | 404 | # DataLoaders creation: 405 | train_dataloader = DataLoader( 406 | train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size 407 | ) 408 | eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) 409 | 410 | # Optimizer 411 | # Split weights in two groups, one with weight decay and the other not. 412 | no_decay = ["bias", "LayerNorm.weight"] 413 | optimizer_grouped_parameters = [ 414 | { 415 | "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 416 | "weight_decay": args.weight_decay, 417 | }, 418 | { 419 | "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 420 | "weight_decay": 0.0, 421 | }, 422 | ] 423 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) 424 | 425 | # Prepare everything with our `accelerator`. 426 | model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( 427 | model, optimizer, train_dataloader, eval_dataloader 428 | ) 429 | 430 | # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be 431 | # shorter in multiprocess) 432 | 433 | # Scheduler and math around the number of training steps. 434 | num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) 435 | if args.max_train_steps is None: 436 | args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch 437 | else: 438 | args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) 439 | 440 | lr_scheduler = get_scheduler( 441 | name=args.lr_scheduler_type, 442 | optimizer=optimizer, 443 | num_warmup_steps=args.num_warmup_steps, 444 | num_training_steps=args.max_train_steps, 445 | ) 446 | 447 | # Train! 448 | total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps 449 | 450 | logger.info("***** Running training *****") 451 | logger.info(f" Num examples = {len(train_dataset)}") 452 | logger.info(f" Num Epochs = {args.num_train_epochs}") 453 | logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") 454 | logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") 455 | logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") 456 | logger.info(f" Total optimization steps = {args.max_train_steps}") 457 | # Only show the progress bar once on each machine. 
458 | progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) 459 | completed_steps = 0 460 | 461 | for epoch in range(args.num_train_epochs): 462 | model.train() 463 | for step, batch in enumerate(train_dataloader): 464 | outputs = model(**batch) 465 | loss = outputs.loss 466 | loss = loss / args.gradient_accumulation_steps 467 | accelerator.backward(loss) 468 | if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: 469 | optimizer.step() 470 | lr_scheduler.step() 471 | optimizer.zero_grad() 472 | progress_bar.update(1) 473 | completed_steps += 1 474 | 475 | if completed_steps >= args.max_train_steps: 476 | break 477 | 478 | model.eval() 479 | losses = [] 480 | for step, batch in enumerate(eval_dataloader): 481 | with torch.no_grad(): 482 | outputs = model(**batch) 483 | 484 | loss = outputs.loss 485 | losses.append(accelerator.gather(loss.repeat(args.per_device_eval_batch_size))) 486 | 487 | losses = torch.cat(losses) 488 | losses = losses[: len(eval_dataset)] 489 | perplexity = math.exp(torch.mean(losses)) 490 | 491 | logger.info(f"epoch {epoch}: perplexity: {perplexity}") 492 | 493 | if args.output_dir is not None: 494 | accelerator.wait_for_everyone() 495 | unwrapped_model = accelerator.unwrap_model(model) 496 | unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save) 497 | 498 | 499 | if __name__ == "__main__": 500 | main() 501 | -------------------------------------------------------------------------------- /run_plm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Team All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for permutation language modeling. 18 | """ 19 | # You can also adapt this script on your own permutation language modeling task. Pointers for this are left as comments. 20 | 21 | import logging 22 | import math 23 | import os 24 | import sys 25 | from dataclasses import dataclass, field 26 | from typing import Optional 27 | 28 | from datasets import load_dataset 29 | 30 | import transformers 31 | from transformers import ( 32 | AutoConfig, 33 | AutoTokenizer, 34 | DataCollatorForPermutationLanguageModeling, 35 | HfArgumentParser, 36 | Trainer, 37 | TrainingArguments, 38 | XLNetConfig, 39 | XLNetLMHeadModel, 40 | set_seed, 41 | ) 42 | from transformers.trainer_utils import get_last_checkpoint, is_main_process 43 | from transformers.utils import check_min_version 44 | 45 | 46 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 47 | check_min_version("4.7.0.dev0") 48 | 49 | logger = logging.getLogger(__name__) 50 | 51 | 52 | @dataclass 53 | class ModelArguments: 54 | """ 55 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 
56 | """ 57 | 58 | model_name_or_path: Optional[str] = field( 59 | default=None, 60 | metadata={ 61 | "help": "The model checkpoint for weights initialization." 62 | "Don't set if you want to train a model from scratch." 63 | }, 64 | ) 65 | config_name: Optional[str] = field( 66 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 67 | ) 68 | tokenizer_name: Optional[str] = field( 69 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 70 | ) 71 | cache_dir: Optional[str] = field( 72 | default=None, 73 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 74 | ) 75 | use_fast_tokenizer: bool = field( 76 | default=True, 77 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 78 | ) 79 | model_revision: str = field( 80 | default="main", 81 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 82 | ) 83 | use_auth_token: bool = field( 84 | default=False, 85 | metadata={ 86 | "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " 87 | "with private models)." 88 | }, 89 | ) 90 | 91 | 92 | @dataclass 93 | class DataTrainingArguments: 94 | """ 95 | Arguments pertaining to what data we are going to input our model for training and eval. 96 | """ 97 | 98 | dataset_name: Optional[str] = field( 99 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 100 | ) 101 | dataset_config_name: Optional[str] = field( 102 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 103 | ) 104 | train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) 105 | validation_file: Optional[str] = field( 106 | default=None, 107 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, 108 | ) 109 | overwrite_cache: bool = field( 110 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 111 | ) 112 | validation_split_percentage: Optional[int] = field( 113 | default=5, 114 | metadata={ 115 | "help": "The percentage of the train set used as validation set in case there's no validation split" 116 | }, 117 | ) 118 | max_seq_length: int = field( 119 | default=512, 120 | metadata={ 121 | "help": "The maximum total input sequence length after tokenization. Sequences longer " 122 | "than this will be truncated." 123 | }, 124 | ) 125 | preprocessing_num_workers: Optional[int] = field( 126 | default=None, 127 | metadata={"help": "The number of processes to use for the preprocessing."}, 128 | ) 129 | plm_probability: float = field( 130 | default=1 / 6, 131 | metadata={ 132 | "help": "Ratio of length of a span of masked tokens to surrounding context length for " 133 | "permutation language modeling." 134 | }, 135 | ) 136 | max_span_length: int = field( 137 | default=5, metadata={"help": "Maximum length of a span of masked tokens for permutation language modeling."} 138 | ) 139 | line_by_line: bool = field( 140 | default=False, 141 | metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."}, 142 | ) 143 | pad_to_max_length: bool = field( 144 | default=False, 145 | metadata={ 146 | "help": "Whether to pad all samples to `max_seq_length`. 
" 147 | "If False, will pad the samples dynamically when batching to the maximum length in the batch." 148 | }, 149 | ) 150 | max_train_samples: Optional[int] = field( 151 | default=None, 152 | metadata={ 153 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this " 154 | "value if set." 155 | }, 156 | ) 157 | max_eval_samples: Optional[int] = field( 158 | default=None, 159 | metadata={ 160 | "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 161 | "value if set." 162 | }, 163 | ) 164 | 165 | def __post_init__(self): 166 | if self.dataset_name is None and self.train_file is None and self.validation_file is None: 167 | raise ValueError("Need either a dataset name or a training/validation file.") 168 | else: 169 | if self.train_file is not None: 170 | extension = self.train_file.split(".")[-1] 171 | assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." 172 | if self.validation_file is not None: 173 | extension = self.validation_file.split(".")[-1] 174 | assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." 175 | 176 | 177 | def main(): 178 | # See all possible arguments in src/transformers/training_args.py 179 | # or by passing the --help flag to this script. 180 | # We now keep distinct sets of args, for a cleaner separation of concerns. 181 | 182 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 183 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 184 | # If we pass only one argument to the script and it's the path to a json file, 185 | # let's parse it to get our arguments. 186 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 187 | else: 188 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 189 | 190 | # Detecting last checkpoint. 191 | last_checkpoint = None 192 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 193 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 194 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 195 | raise ValueError( 196 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 197 | "Use --overwrite_output_dir to overcome." 198 | ) 199 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 200 | logger.info( 201 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 202 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
203 | ) 204 | 205 | # Setup logging 206 | logging.basicConfig( 207 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 208 | datefmt="%m/%d/%Y %H:%M:%S", 209 | handlers=[logging.StreamHandler(sys.stdout)], 210 | ) 211 | logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) 212 | 213 | # Log on each process the small summary: 214 | logger.warning( 215 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 216 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 217 | ) 218 | # Set the verbosity to info of the Transformers logger (on main process only): 219 | if is_main_process(training_args.local_rank): 220 | transformers.utils.logging.set_verbosity_info() 221 | transformers.utils.logging.enable_default_handler() 222 | transformers.utils.logging.enable_explicit_format() 223 | logger.info(f"Training/evaluation parameters {training_args}") 224 | 225 | # Set seed before initializing model. 226 | set_seed(training_args.seed) 227 | 228 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 229 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 230 | # (the dataset will be downloaded automatically from the datasets Hub). 231 | # 232 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called 233 | # 'text' is found. You can easily tweak this behavior (see below). 234 | # 235 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 236 | # download the dataset. 237 | if data_args.dataset_name is not None: 238 | # Downloading and loading a dataset from the hub. 239 | datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) 240 | if "validation" not in datasets.keys(): 241 | datasets["validation"] = load_dataset( 242 | data_args.dataset_name, 243 | data_args.dataset_config_name, 244 | split=f"train[:{data_args.validation_split_percentage}%]", 245 | cache_dir=model_args.cache_dir, 246 | ) 247 | datasets["train"] = load_dataset( 248 | data_args.dataset_name, 249 | data_args.dataset_config_name, 250 | split=f"train[{data_args.validation_split_percentage}%:]", 251 | cache_dir=model_args.cache_dir, 252 | ) 253 | else: 254 | data_files = {} 255 | if data_args.train_file is not None: 256 | data_files["train"] = data_args.train_file 257 | if data_args.validation_file is not None: 258 | data_files["validation"] = data_args.validation_file 259 | extension = data_args.train_file.split(".")[-1] 260 | if extension == "txt": 261 | extension = "text" 262 | datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) 263 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 264 | # https://huggingface.co/docs/datasets/loading_datasets.html. 265 | 266 | # Load pretrained model and tokenizer 267 | # 268 | # Distributed training: 269 | # The .from_pretrained methods guarantee that only one local process can concurrently 270 | # download model & vocab. 
271 | config_kwargs = { 272 | "cache_dir": model_args.cache_dir, 273 | "revision": model_args.model_revision, 274 | "use_auth_token": True if model_args.use_auth_token else None, 275 | } 276 | if model_args.config_name: 277 | config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) 278 | elif model_args.model_name_or_path: 279 | config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) 280 | else: 281 | config = XLNetConfig() 282 | logger.warning("You are instantiating a new config instance from scratch.") 283 | 284 | tokenizer_kwargs = { 285 | "cache_dir": model_args.cache_dir, 286 | "use_fast": model_args.use_fast_tokenizer, 287 | "revision": model_args.model_revision, 288 | "use_auth_token": True if model_args.use_auth_token else None, 289 | } 290 | if model_args.tokenizer_name: 291 | tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) 292 | elif model_args.model_name_or_path: 293 | tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) 294 | else: 295 | raise ValueError( 296 | "You are instantiating a new tokenizer from scratch. This is not supported by this script." 297 | "You can do it from another script, save it, and load it from here, using --tokenizer_name." 298 | ) 299 | 300 | if model_args.model_name_or_path: 301 | model = XLNetLMHeadModel.from_pretrained( 302 | model_args.model_name_or_path, 303 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 304 | config=config, 305 | cache_dir=model_args.cache_dir, 306 | revision=model_args.model_revision, 307 | use_auth_token=True if model_args.use_auth_token else None, 308 | ) 309 | else: 310 | logger.info("Training new model from scratch") 311 | model = XLNetLMHeadModel.from_config(config) 312 | 313 | model.resize_token_embeddings(len(tokenizer)) 314 | 315 | # Preprocessing the datasets. 316 | # First we tokenize all the texts. 317 | if training_args.do_train: 318 | column_names = datasets["train"].column_names 319 | else: 320 | column_names = datasets["validation"].column_names 321 | text_column_name = "text" if "text" in column_names else column_names[0] 322 | 323 | if data_args.max_seq_length > tokenizer.model_max_length: 324 | logger.warning( 325 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" 326 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 327 | ) 328 | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) 329 | 330 | if data_args.line_by_line: 331 | # When using line_by_line, we just tokenize each nonempty line. 332 | padding = "max_length" if data_args.pad_to_max_length else False 333 | 334 | def tokenize_function(examples): 335 | # Remove empty lines 336 | examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()] 337 | return tokenizer(examples["text"], padding=padding, truncation=True, max_length=max_seq_length) 338 | 339 | tokenized_datasets = datasets.map( 340 | tokenize_function, 341 | batched=True, 342 | num_proc=data_args.preprocessing_num_workers, 343 | remove_columns=[text_column_name], 344 | load_from_cache_file=not data_args.overwrite_cache, 345 | ) 346 | else: 347 | # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. 
348 | def tokenize_function(examples): 349 | return tokenizer(examples[text_column_name]) 350 | 351 | tokenized_datasets = datasets.map( 352 | tokenize_function, 353 | batched=True, 354 | num_proc=data_args.preprocessing_num_workers, 355 | remove_columns=column_names, 356 | load_from_cache_file=not data_args.overwrite_cache, 357 | ) 358 | 359 | # Main data processing function that will concatenate all texts from our dataset and generate chunks of 360 | # max_seq_length. 361 | def group_texts(examples): 362 | # Concatenate all texts. 363 | concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} 364 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 365 | # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can 366 | # customize this part to your needs. 367 | total_length = (total_length // max_seq_length) * max_seq_length 368 | # Split by chunks of max_len. 369 | result = { 370 | k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)] 371 | for k, t in concatenated_examples.items() 372 | } 373 | return result 374 | 375 | # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a 376 | # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value 377 | # might be slower to preprocess. 378 | # 379 | # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: 380 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map 381 | 382 | tokenized_datasets = tokenized_datasets.map( 383 | group_texts, 384 | batched=True, 385 | num_proc=data_args.preprocessing_num_workers, 386 | load_from_cache_file=not data_args.overwrite_cache, 387 | ) 388 | 389 | if training_args.do_train: 390 | if "train" not in tokenized_datasets: 391 | raise ValueError("--do_train requires a train dataset") 392 | train_dataset = tokenized_datasets["train"] 393 | if data_args.max_train_samples is not None: 394 | train_dataset = train_dataset.select(range(data_args.max_train_samples)) 395 | 396 | if training_args.do_eval: 397 | if "validation" not in tokenized_datasets: 398 | raise ValueError("--do_eval requires a validation dataset") 399 | eval_dataset = tokenized_datasets["validation"] 400 | if data_args.max_eval_samples is not None: 401 | eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) 402 | 403 | # Data collator 404 | data_collator = DataCollatorForPermutationLanguageModeling( 405 | tokenizer=tokenizer, 406 | plm_probability=data_args.plm_probability, 407 | max_span_length=data_args.max_span_length, 408 | ) 409 | 410 | # Initialize our Trainer 411 | trainer = Trainer( 412 | model=model, 413 | args=training_args, 414 | train_dataset=train_dataset if training_args.do_train else None, 415 | eval_dataset=eval_dataset if training_args.do_eval else None, 416 | tokenizer=tokenizer, 417 | data_collator=data_collator, 418 | ) 419 | 420 | # Training 421 | if training_args.do_train: 422 | checkpoint = None 423 | if training_args.resume_from_checkpoint is not None: 424 | checkpoint = training_args.resume_from_checkpoint 425 | elif last_checkpoint is not None: 426 | checkpoint = last_checkpoint 427 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 428 | trainer.save_model() # Saves the tokenizer too for easy upload 429 | metrics = train_result.metrics 430 | 431 | max_train_samples = ( 432 | 
data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 433 | ) 434 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 435 | 436 | trainer.log_metrics("train", metrics) 437 | trainer.save_metrics("train", metrics) 438 | trainer.save_state() 439 | 440 | # Evaluation 441 | if training_args.do_eval: 442 | logger.info("*** Evaluate ***") 443 | 444 | metrics = trainer.evaluate() 445 | 446 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 447 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 448 | perplexity = math.exp(metrics["eval_loss"]) 449 | metrics["perplexity"] = perplexity 450 | 451 | trainer.log_metrics("eval", metrics) 452 | trainer.save_metrics("eval", metrics) 453 | 454 | if training_args.push_to_hub: 455 | kwargs = {"finetuned_from": model_args.model_name_or_path, "tags": "language-modeling"} 456 | if data_args.dataset_name is not None: 457 | kwargs["dataset_tags"] = data_args.dataset_name 458 | if data_args.dataset_config_name is not None: 459 | kwargs["dataset_args"] = data_args.dataset_config_name 460 | kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" 461 | else: 462 | kwargs["dataset"] = data_args.dataset_name 463 | 464 | trainer.push_to_hub(**kwargs) 465 | 466 | 467 | def _mp_fn(index): 468 | # For xla_spawn (TPUs) 469 | main() 470 | 471 | 472 | if __name__ == "__main__": 473 | main() 474 | -------------------------------------------------------------------------------- /zero3_gpu_run_mlm.sh: -------------------------------------------------------------------------------- 1 | # export NCCL_IB_DISABLE=1 2 | export BS=32 3 | export NCCL_DEBUG=INFO 4 | 5 | deepspeed run_mlm.py \ 6 | --seed 42 \ 7 | --model_type bert \ 8 | --tokenizer_name beomi/KcELECTRA-base \ 9 | --train_file ./sampled_20190101_20200611_v2.txt \ 10 | --num_train_epochs 2 \ 11 | --per_device_train_batch_size $BS \ 12 | --per_device_eval_batch_size $BS \ 13 | --do_train \ 14 | --output_dir ./test-bert-zero3 \ 15 | --fp16 \ 16 | --logging_first_step \ 17 | --max_seq_length 300 \ 18 | --deepspeed ./ds_zero3_1gpu.json --------------------------------------------------------------------------------
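> Note: the repository's own launch scripts (`gpu_run_mlm.sh`, `zero3_gpu_run_mlm.sh`) target the Trainer-based `run_mlm.py`. As a rough sketch of how the `run_mlm_no_trainer.py` variant could be launched — assuming 🤗 Accelerate has already been set up via `accelerate config`, and using `bert-base-uncased` and the public `wikitext` dataset purely as placeholder values, not as anything shipped with this repo — an invocation might look like the following.

```bash
# Illustrative only: single-node launch of the Accelerate-based MLM script.
# Model and dataset names below are placeholders; swap in your own checkpoint/data.
accelerate launch run_mlm_no_trainer.py \
    --model_name_or_path bert-base-uncased \
    --dataset_name wikitext \
    --dataset_config_name wikitext-2-raw-v1 \
    --per_device_train_batch_size 8 \
    --num_train_epochs 3 \
    --output_dir ./test-mlm-no-trainer
```

All flags above are defined in `parse_args` of `run_mlm_no_trainer.py`. When training from a local text file instead of a hub dataset, pass both `--train_file` and `--validation_file`: with local files this script does not create a validation split automatically, and it evaluates perplexity on the `validation` split at the end of every epoch.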