├── requirements.txt ├── transformers ├── commands │ ├── __init__.py │ └── user.py ├── data │ ├── processors │ │ ├── __init__.py │ │ ├── xnli.py │ │ └── utils.py │ ├── __init__.py │ └── metrics │ │ └── __init__.py ├── configuration_camembert.py ├── configuration_roberta.py ├── convert_bert_original_tf_checkpoint_to_pytorch.py ├── convert_albert_original_tf_checkpoint_to_pytorch.py ├── tokenization_distilbert.py ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py ├── convert_openai_original_tf_checkpoint_to_pytorch.py ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py ├── configuration_distilbert.py ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py ├── convert_bert_pytorch_checkpoint_to_original_tf.py ├── configuration_albert.py ├── configuration_openai.py ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py ├── configuration_ctrl.py ├── configuration_gpt2.py ├── configuration_xlnet.py ├── configuration_transfo_xl.py ├── __main__.py ├── hf_api.py ├── configuration_bert.py ├── tokenization_roberta.py ├── tokenization_camembert.py ├── optimization.py ├── tokenization_openai.py ├── configuration_xlm.py ├── modeling_tf_transfo_xl_utilities.py ├── tokenization_ctrl.py ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py ├── tokenization_auto.py ├── configuration_auto.py └── optimization_tf.py └── README.md /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboardX 2 | tensorboard 3 | scikit-learn 4 | seqeval 5 | -------------------------------------------------------------------------------- /transformers/commands/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from argparse import ArgumentParser 3 | 4 | class BaseTransformersCLICommand(ABC): 5 | @staticmethod 6 | @abstractmethod 7 | def register_subcommand(parser: ArgumentParser): 8 | raise NotImplementedError() 9 | 10 | @abstractmethod 11 | def run(self): 12 | raise NotImplementedError() 13 | -------------------------------------------------------------------------------- /transformers/data/processors/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import InputExample, InputFeatures, DataProcessor 2 | from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features 3 | from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor 4 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels -------------------------------------------------------------------------------- /transformers/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures 2 | from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features 3 | from .processors import squad_convert_examples_to_features, SquadExample, SquadV1Processor, SquadV2Processor 4 | from .processors import xnli_output_modes, xnli_processors, xnli_tasks_num_labels 5 | 6 | from .metrics import is_sklearn_available 7 | if is_sklearn_available(): 8 | from .metrics import glue_compute_metrics, xnli_compute_metrics 9 | -------------------------------------------------------------------------------- /transformers/configuration_camembert.py: 
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ CamemBERT configuration """ 17 | 18 | from __future__ import (absolute_import, division, print_function, 19 | unicode_literals) 20 | 21 | import logging 22 | 23 | from .configuration_roberta import RobertaConfig 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 28 | 'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json", 29 | } 30 | 31 | 32 | class CamembertConfig(RobertaConfig): 33 | pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ALBERT + DUMA 2 | 3 | This is the source code of our paper [DUMA: Reading Comprehension with Transposition Thinking](https://ieeexplore.ieee.org/document/9664302). The code is based on https://github.com/huggingface/transformers . 4 | 5 | The code has been tested with PyTorch 1.0.0 and Python 3.6. If you want to use fp16 training, make sure you are using commit 33512f9 of https://github.com/NVIDIA/apex . 6 | 7 | It is recommended to download the pretrained model, config and vocab files and update the paths in transformers/{modeling_albert.py, configuration_albert.py, tokenization_albert.py}. 8 | 9 | Download train.json, dev.json and test.json from https://github.com/nlpdata/dream/tree/master/data and save them into DATA_DIR. 10 | 11 | To run ALBERT on the DREAM dataset, use the following script: 12 | ```bash 13 | export DATA_DIR=/path/to/data 14 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python run_multiple_choice.py \ 15 | --do_lower_case \ 16 | --do_train \ 17 | --do_eval \ 18 | --overwrite_output \ 19 | --overwrite_cache \ 20 | --eval_all_checkpoints \ 21 | --task_name dream \ 22 | --per_gpu_eval_batch_size=10 \ 23 | --logging_steps 1 \ 24 | --max_seq_length 512 \ 25 | --model_type albert \ 26 | --model_name_or_path albert-base-v2 \ 27 | --data_dir $DATA_DIR \ 28 | --learning_rate 5e-6 \ 29 | --num_train_epochs 15 \ 30 | --output_dir albert_base_dream \ 31 | --per_gpu_train_batch_size=1 \ 32 | --gradient_accumulation_steps 1 \ 33 | --warmup_steps 100 \ 34 | --save_steps 764 35 | ``` 36 | 37 | To run ALBERT+DUMA on the DREAM dataset, replace AlbertForMultipleChoice with AlbertDUMAForMultipleChoice (see the sketch at the end of this README). 38 | 39 | Evaluation results for each checkpoint are saved to my_eval_results.txt . 40 | 41 | For ALBERT xxlarge, please refer to the paper for the hyperparameter settings. 42 | 43 | More details will be added in the future. 
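The snippet below only sketches the ALBERT → ALBERT+DUMA swap mentioned above. It assumes run_multiple_choice.py selects the model class through a MODEL_CLASSES mapping, as in the upstream Hugging Face example script, and that both multiple-choice heads are defined in transformers/modeling_albert.py of this repository; adapt it if the actual script is structured differently.

```python
# Illustrative sketch, not verbatim from run_multiple_choice.py.
from transformers import AlbertConfig, AlbertTokenizer
# Assumption: both heads are defined in transformers/modeling_albert.py.
from transformers.modeling_albert import AlbertForMultipleChoice, AlbertDUMAForMultipleChoice

MODEL_CLASSES = {
    # baseline ALBERT multiple-choice head
    "albert": (AlbertConfig, AlbertForMultipleChoice, AlbertTokenizer),
    # ALBERT + DUMA: use this entry instead to train with the DUMA head
    # "albert": (AlbertConfig, AlbertDUMAForMultipleChoice, AlbertTokenizer),
}
```

With the DUMA head selected, the training command shown above stays unchanged.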
44 | -------------------------------------------------------------------------------- /transformers/configuration_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ RoBERTa configuration """ 17 | 18 | from __future__ import (absolute_import, division, print_function, 19 | unicode_literals) 20 | 21 | import logging 22 | 23 | from .configuration_bert import BertConfig 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 28 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", 29 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", 30 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", 31 | 'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json", 32 | 'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-config.json", 33 | 'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-config.json", 34 | } 35 | 36 | 37 | class RobertaConfig(BertConfig): 38 | pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 39 | -------------------------------------------------------------------------------- /transformers/convert_bert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import argparse 22 | import torch 23 | 24 | from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert 25 | 26 | import logging 27 | logging.basicConfig(level=logging.INFO) 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = BertConfig.from_json_file(bert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = BertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_bert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | ## Required parameters 46 | parser.add_argument("--tf_checkpoint_path", 47 | default = None, 48 | type = str, 49 | required = True, 50 | help = "Path to the TensorFlow checkpoint path.") 51 | parser.add_argument("--bert_config_file", 52 | default = None, 53 | type = str, 54 | required = True, 55 | help = "The config json file corresponding to the pre-trained BERT model. \n" 56 | "This specifies the model architecture.") 57 | parser.add_argument("--pytorch_dump_path", 58 | default = None, 59 | type = str, 60 | required = True, 61 | help = "Path to the output PyTorch model.") 62 | args = parser.parse_args() 63 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, 64 | args.bert_config_file, 65 | args.pytorch_dump_path) 66 | -------------------------------------------------------------------------------- /transformers/convert_albert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert ALBERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import argparse 22 | import torch 23 | 24 | from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert 25 | 26 | import logging 27 | logging.basicConfig(level=logging.INFO) 28 | 29 | 30 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path): 31 | # Initialise PyTorch model 32 | config = AlbertConfig.from_json_file(albert_config_file) 33 | print("Building PyTorch model from configuration: {}".format(str(config))) 34 | model = AlbertForMaskedLM(config) 35 | 36 | # Load weights from tf checkpoint 37 | load_tf_weights_in_albert(model, config, tf_checkpoint_path) 38 | 39 | # Save pytorch-model 40 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 41 | torch.save(model.state_dict(), pytorch_dump_path) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | ## Required parameters 47 | parser.add_argument("--tf_checkpoint_path", 48 | default = None, 49 | type = str, 50 | required = True, 51 | help = "Path to the TensorFlow checkpoint path.") 52 | parser.add_argument("--albert_config_file", 53 | default = None, 54 | type = str, 55 | required = True, 56 | help = "The config json file corresponding to the pre-trained ALBERT model. \n" 57 | "This specifies the model architecture.") 58 | parser.add_argument("--pytorch_dump_path", 59 | default = None, 60 | type = str, 61 | required = True, 62 | help = "Path to the output PyTorch model.") 63 | args = parser.parse_args() 64 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, 65 | args.albert_config_file, 66 | args.pytorch_dump_path) 67 | -------------------------------------------------------------------------------- /transformers/tokenization_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for DistilBERT.""" 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import collections 20 | import logging 21 | import os 22 | import unicodedata 23 | from io import open 24 | 25 | from .tokenization_bert import BertTokenizer 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} 30 | 31 | PRETRAINED_VOCAB_FILES_MAP = { 32 | 'vocab_file': 33 | { 34 | 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 35 | 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 36 | 'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt", 37 | 'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", 38 | } 39 | } 40 | 41 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 42 | 'distilbert-base-uncased': 512, 43 | 'distilbert-base-uncased-distilled-squad': 512, 44 | 'distilbert-base-german-cased': 512, 45 | 'distilbert-base-multilingual-cased': 512, 46 | } 47 | 48 | 49 | class DistilBertTokenizer(BertTokenizer): 50 | r""" 51 | Constructs a DistilBertTokenizer. 52 | :class:`~transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece 53 | 54 | Args: 55 | vocab_file: Path to a one-wordpiece-per-line vocabulary file 56 | do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False 57 | do_basic_tokenize: Whether to do basic tokenization before wordpiece. 58 | max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the 59 | minimum of this value (if specified) and the underlying BERT model's sequence length. 60 | never_split: List of tokens which will never be split during tokenization. Only has an effect when 61 | do_wordpiece_only=False 62 | """ 63 | 64 | vocab_files_names = VOCAB_FILES_NAMES 65 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 66 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 67 | -------------------------------------------------------------------------------- /transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from transformers import (CONFIG_NAME, WEIGHTS_NAME, 25 | GPT2Config, 26 | GPT2Model, 27 | load_tf_weights_in_gpt2) 28 | 29 | import logging 30 | logging.basicConfig(level=logging.INFO) 31 | 32 | 33 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 34 | # Construct model 35 | if gpt2_config_file == "": 36 | config = GPT2Config() 37 | else: 38 | config = GPT2Config.from_json_file(gpt2_config_file) 39 | model = GPT2Model(config) 40 | 41 | # Load weights from numpy 42 | load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) 43 | 44 | # Save pytorch-model 45 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 46 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 47 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 48 | torch.save(model.state_dict(), pytorch_weights_dump_path) 49 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 50 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 51 | f.write(config.to_json_string()) 52 | 53 | 54 | if __name__ == "__main__": 55 | parser = argparse.ArgumentParser() 56 | ## Required parameters 57 | parser.add_argument("--gpt2_checkpoint_path", 58 | default = None, 59 | type = str, 60 | required = True, 61 | help = "Path to the TensorFlow checkpoint path.") 62 | parser.add_argument("--pytorch_dump_folder_path", 63 | default = None, 64 | type = str, 65 | required = True, 66 | help = "Path to the output PyTorch model.") 67 | parser.add_argument("--gpt2_config_file", 68 | default = "", 69 | type = str, 70 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 71 | "This specifies the model architecture.") 72 | args = parser.parse_args() 73 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, 74 | args.gpt2_config_file, 75 | args.pytorch_dump_folder_path) 76 | -------------------------------------------------------------------------------- /transformers/data/processors/xnli.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ XNLI utils (dataset loading and evaluation) """ 17 | 18 | from __future__ import absolute_import, division, print_function 19 | 20 | import logging 21 | import os 22 | 23 | from .utils import DataProcessor, InputExample 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | class XnliProcessor(DataProcessor): 28 | """Processor for the XNLI dataset. 
29 | Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207""" 30 | 31 | def __init__(self, language, train_language = None): 32 | self.language = language 33 | self.train_language = train_language 34 | 35 | def get_train_examples(self, data_dir): 36 | """See base class.""" 37 | lg = self.language if self.train_language is None else self.train_language 38 | lines = self._read_tsv(os.path.join(data_dir, "XNLI-MT-1.0/multinli/multinli.train.{}.tsv".format(lg))) 39 | examples = [] 40 | for (i, line) in enumerate(lines): 41 | if i == 0: 42 | continue 43 | guid = "%s-%s" % ('train', i) 44 | text_a = line[0] 45 | text_b = line[1] 46 | label = "contradiction" if line[2] == "contradictory" else line[2] 47 | assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str) 48 | examples.append( 49 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 50 | return examples 51 | 52 | def get_test_examples(self, data_dir): 53 | """See base class.""" 54 | lines = self._read_tsv(os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv")) 55 | examples = [] 56 | for (i, line) in enumerate(lines): 57 | if i == 0: 58 | continue 59 | language = line[0] 60 | if language != self.language: 61 | continue 62 | guid = "%s-%s" % ('test', i) 63 | text_a = line[6] 64 | text_b = line[7] 65 | label = line[1] 66 | assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str) 67 | examples.append( 68 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 69 | return examples 70 | 71 | def get_labels(self): 72 | """See base class.""" 73 | return ["contradiction", "entailment", "neutral"] 74 | 75 | xnli_processors = { 76 | "xnli": XnliProcessor, 77 | } 78 | 79 | xnli_output_modes = { 80 | "xnli": "classification", 81 | } 82 | 83 | xnli_tasks_num_labels = { 84 | "xnli": 3, 85 | } 86 | -------------------------------------------------------------------------------- /transformers/convert_openai_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from transformers import (CONFIG_NAME, WEIGHTS_NAME, 25 | OpenAIGPTConfig, 26 | OpenAIGPTModel, 27 | load_tf_weights_in_openai_gpt) 28 | 29 | import logging 30 | logging.basicConfig(level=logging.INFO) 31 | 32 | 33 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 34 | # Construct model 35 | if openai_config_file == "": 36 | config = OpenAIGPTConfig() 37 | else: 38 | config = OpenAIGPTConfig.from_json_file(openai_config_file) 39 | model = OpenAIGPTModel(config) 40 | 41 | # Load weights from numpy 42 | load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path) 43 | 44 | # Save pytorch-model 45 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 46 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 47 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 48 | torch.save(model.state_dict(), pytorch_weights_dump_path) 49 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 50 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 51 | f.write(config.to_json_string()) 52 | 53 | 54 | if __name__ == "__main__": 55 | parser = argparse.ArgumentParser() 56 | ## Required parameters 57 | parser.add_argument("--openai_checkpoint_folder_path", 58 | default = None, 59 | type = str, 60 | required = True, 61 | help = "Path to the TensorFlow checkpoint path.") 62 | parser.add_argument("--pytorch_dump_folder_path", 63 | default = None, 64 | type = str, 65 | required = True, 66 | help = "Path to the output PyTorch model.") 67 | parser.add_argument("--openai_config_file", 68 | default = "", 69 | type = str, 70 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 71 | "This specifies the model architecture.") 72 | args = parser.parse_args() 73 | convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path, 74 | args.openai_config_file, 75 | args.pytorch_dump_folder_path) 76 | -------------------------------------------------------------------------------- /transformers/data/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | import csv 18 | import sys 19 | import logging 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | try: 24 | from scipy.stats import pearsonr, spearmanr 25 | from sklearn.metrics import matthews_corrcoef, f1_score 26 | _has_sklearn = True 27 | except (AttributeError, ImportError) as e: 28 | logger.warning("To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html") 29 | _has_sklearn = False 30 | 31 | def is_sklearn_available(): 32 | return _has_sklearn 33 | 34 | if _has_sklearn: 35 | 36 | def simple_accuracy(preds, labels): 37 | return (preds == labels).mean() 38 | 39 | 40 | def acc_and_f1(preds, labels): 41 | acc = simple_accuracy(preds, labels) 42 | f1 = f1_score(y_true=labels, y_pred=preds) 43 | return { 44 | "acc": acc, 45 | "f1": f1, 46 | "acc_and_f1": (acc + f1) / 2, 47 | } 48 | 49 | 50 | def pearson_and_spearman(preds, labels): 51 | pearson_corr = pearsonr(preds, labels)[0] 52 | spearman_corr = spearmanr(preds, labels)[0] 53 | return { 54 | "pearson": pearson_corr, 55 | "spearmanr": spearman_corr, 56 | "corr": (pearson_corr + spearman_corr) / 2, 57 | } 58 | 59 | 60 | def glue_compute_metrics(task_name, preds, labels): 61 | assert len(preds) == len(labels) 62 | if task_name == "cola": 63 | return {"mcc": matthews_corrcoef(labels, preds)} 64 | elif task_name == "sst-2": 65 | return {"acc": simple_accuracy(preds, labels)} 66 | elif task_name == "mrpc": 67 | return acc_and_f1(preds, labels) 68 | elif task_name == "sts-b": 69 | return pearson_and_spearman(preds, labels) 70 | elif task_name == "qqp": 71 | return acc_and_f1(preds, labels) 72 | elif task_name == "mnli": 73 | return {"acc": simple_accuracy(preds, labels)} 74 | elif task_name == "mnli-mm": 75 | return {"acc": simple_accuracy(preds, labels)} 76 | elif task_name == "qnli": 77 | return {"acc": simple_accuracy(preds, labels)} 78 | elif task_name == "rte": 79 | return {"acc": simple_accuracy(preds, labels)} 80 | elif task_name == "wnli": 81 | return {"acc": simple_accuracy(preds, labels)} 82 | else: 83 | raise KeyError(task_name) 84 | 85 | 86 | def xnli_compute_metrics(task_name, preds, labels): 87 | assert len(preds) == len(labels) 88 | if task_name == "xnli": 89 | return {"acc": simple_accuracy(preds, labels)} 90 | else: 91 | raise KeyError(task_name) 92 | -------------------------------------------------------------------------------- /transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import json 21 | from io import open 22 | 23 | import torch 24 | import numpy 25 | 26 | from transformers import CONFIG_NAME, WEIGHTS_NAME 27 | from transformers.tokenization_xlm import VOCAB_FILES_NAMES 28 | 29 | import logging 30 | logging.basicConfig(level=logging.INFO) 31 | 32 | def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path): 33 | # Load checkpoint 34 | chkpt = torch.load(xlm_checkpoint_path, map_location='cpu') 35 | 36 | state_dict = chkpt['model'] 37 | 38 | # We have the base model one level deeper than the original XLM repository 39 | two_levels_state_dict = {} 40 | for k, v in state_dict.items(): 41 | if 'pred_layer' in k: 42 | two_levels_state_dict[k] = v 43 | else: 44 | two_levels_state_dict['transformer.' + k] = v 45 | 46 | config = chkpt['params'] 47 | config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))) 48 | 49 | vocab = chkpt['dico_word2id'] 50 | vocab = dict((s + '' if s.find('@@') == -1 and i > 13 else s.replace('@@', ''), i) for s, i in vocab.items()) 51 | 52 | # Save pytorch-model 53 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 54 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 55 | pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['vocab_file'] 56 | 57 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 58 | torch.save(two_levels_state_dict, pytorch_weights_dump_path) 59 | 60 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 61 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 62 | f.write(json.dumps(config, indent=2) + "\n") 63 | 64 | print("Save vocab file to {}".format(pytorch_config_dump_path)) 65 | with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f: 66 | f.write(json.dumps(vocab, indent=2) + "\n") 67 | 68 | 69 | if __name__ == "__main__": 70 | parser = argparse.ArgumentParser() 71 | ## Required parameters 72 | parser.add_argument("--xlm_checkpoint_path", 73 | default = None, 74 | type = str, 75 | required = True, 76 | help = "Path the official PyTorch dump.") 77 | parser.add_argument("--pytorch_dump_folder_path", 78 | default = None, 79 | type = str, 80 | required = True, 81 | help = "Path to the output PyTorch model.") 82 | args = parser.parse_args() 83 | convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path) 84 | -------------------------------------------------------------------------------- /transformers/configuration_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """ DistilBERT model configuration """ 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import sys 20 | import json 21 | import logging 22 | from io import open 23 | 24 | from .configuration_utils import PretrainedConfig 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 29 | 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", 30 | 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json", 31 | 'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json", 32 | 'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json", 33 | } 34 | 35 | 36 | class DistilBertConfig(PretrainedConfig): 37 | pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 38 | 39 | def __init__(self, 40 | vocab_size_or_config_json_file=30522, 41 | max_position_embeddings=512, 42 | sinusoidal_pos_embds=False, 43 | n_layers=6, 44 | n_heads=12, 45 | dim=768, 46 | hidden_dim=4*768, 47 | dropout=0.1, 48 | attention_dropout=0.1, 49 | activation='gelu', 50 | initializer_range=0.02, 51 | tie_weights_=True, 52 | qa_dropout=0.1, 53 | seq_classif_dropout=0.2, 54 | **kwargs): 55 | super(DistilBertConfig, self).__init__(**kwargs) 56 | 57 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 58 | and isinstance(vocab_size_or_config_json_file, unicode)): 59 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 60 | json_config = json.loads(reader.read()) 61 | for key, value in json_config.items(): 62 | self.__dict__[key] = value 63 | elif isinstance(vocab_size_or_config_json_file, int): 64 | self.vocab_size = vocab_size_or_config_json_file 65 | self.max_position_embeddings = max_position_embeddings 66 | self.sinusoidal_pos_embds = sinusoidal_pos_embds 67 | self.n_layers = n_layers 68 | self.n_heads = n_heads 69 | self.dim = dim 70 | self.hidden_dim = hidden_dim 71 | self.dropout = dropout 72 | self.attention_dropout = attention_dropout 73 | self.activation = activation 74 | self.initializer_range = initializer_range 75 | self.tie_weights_ = tie_weights_ 76 | self.qa_dropout = qa_dropout 77 | self.seq_classif_dropout = seq_classif_dropout 78 | else: 79 | raise ValueError("First argument must be either a vocabulary size (int)" 80 | " or the path to a pretrained model config file (str)") 81 | @property 82 | def hidden_size(self): 83 | return self.dim 84 | 85 | @property 86 | def num_attention_heads(self): 87 | return self.n_heads 88 | 89 | @property 90 | def num_hidden_layers(self): 91 | return self.n_layers 92 | -------------------------------------------------------------------------------- /transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import argparse 23 | import torch 24 | 25 | from transformers import (CONFIG_NAME, WEIGHTS_NAME, 26 | XLNetConfig, 27 | XLNetLMHeadModel, XLNetForQuestionAnswering, 28 | XLNetForSequenceClassification, 29 | load_tf_weights_in_xlnet) 30 | 31 | GLUE_TASKS_NUM_LABELS = { 32 | "cola": 2, 33 | "mnli": 3, 34 | "mrpc": 2, 35 | "sst-2": 2, 36 | "sts-b": 1, 37 | "qqp": 2, 38 | "qnli": 2, 39 | "rte": 2, 40 | "wnli": 2, 41 | } 42 | 43 | import logging 44 | logging.basicConfig(level=logging.INFO) 45 | 46 | def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None): 47 | # Initialise PyTorch model 48 | config = XLNetConfig.from_json_file(bert_config_file) 49 | 50 | finetuning_task = finetuning_task.lower() if finetuning_task is not None else "" 51 | if finetuning_task in GLUE_TASKS_NUM_LABELS: 52 | print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config))) 53 | config.finetuning_task = finetuning_task 54 | config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task] 55 | model = XLNetForSequenceClassification(config) 56 | elif 'squad' in finetuning_task: 57 | config.finetuning_task = finetuning_task 58 | model = XLNetForQuestionAnswering(config) 59 | else: 60 | model = XLNetLMHeadModel(config) 61 | 62 | # Load weights from tf checkpoint 63 | load_tf_weights_in_xlnet(model, config, tf_checkpoint_path) 64 | 65 | # Save pytorch-model 66 | pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) 67 | pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) 68 | print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) 69 | torch.save(model.state_dict(), pytorch_weights_dump_path) 70 | print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) 71 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 72 | f.write(config.to_json_string()) 73 | 74 | 75 | if __name__ == "__main__": 76 | parser = argparse.ArgumentParser() 77 | ## Required parameters 78 | parser.add_argument("--tf_checkpoint_path", 79 | default = None, 80 | type = str, 81 | required = True, 82 | help = "Path to the TensorFlow checkpoint path.") 83 | parser.add_argument("--xlnet_config_file", 84 | default = None, 85 | type = str, 86 | required = True, 87 | help = "The config json file corresponding to the pre-trained XLNet model. 
\n" 88 | "This specifies the model architecture.") 89 | parser.add_argument("--pytorch_dump_folder_path", 90 | default = None, 91 | type = str, 92 | required = True, 93 | help = "Path to the folder to store the PyTorch model or dataset/vocab.") 94 | parser.add_argument("--finetuning_task", 95 | default = None, 96 | type = str, 97 | help = "Name of a task on which the XLNet TensorFloaw model was fine-tuned") 98 | args = parser.parse_args() 99 | print(args) 100 | 101 | convert_xlnet_checkpoint_to_pytorch(args.tf_checkpoint_path, 102 | args.xlnet_config_file, 103 | args.pytorch_dump_folder_path, 104 | args.finetuning_task) 105 | -------------------------------------------------------------------------------- /transformers/convert_bert_pytorch_checkpoint_to_original_tf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" 17 | 18 | import os 19 | import argparse 20 | import torch 21 | import numpy as np 22 | import tensorflow as tf 23 | from transformers import BertModel 24 | 25 | 26 | def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str): 27 | 28 | """ 29 | :param model:BertModel Pytorch model instance to be converted 30 | :param ckpt_dir: Tensorflow model directory 31 | :param model_name: model name 32 | :return: 33 | 34 | Currently supported HF models: 35 | Y BertModel 36 | N BertForMaskedLM 37 | N BertForPreTraining 38 | N BertForMultipleChoice 39 | N BertForNextSentencePrediction 40 | N BertForSequenceClassification 41 | N BertForQuestionAnswering 42 | """ 43 | 44 | tensors_to_transpose = ( 45 | "dense.weight", 46 | "attention.self.query", 47 | "attention.self.key", 48 | "attention.self.value" 49 | ) 50 | 51 | var_map = ( 52 | ('layer.', 'layer_'), 53 | ('word_embeddings.weight', 'word_embeddings'), 54 | ('position_embeddings.weight', 'position_embeddings'), 55 | ('token_type_embeddings.weight', 'token_type_embeddings'), 56 | ('.', '/'), 57 | ('LayerNorm/weight', 'LayerNorm/gamma'), 58 | ('LayerNorm/bias', 'LayerNorm/beta'), 59 | ('weight', 'kernel') 60 | ) 61 | 62 | if not os.path.isdir(ckpt_dir): 63 | os.makedirs(ckpt_dir) 64 | 65 | state_dict = model.state_dict() 66 | 67 | def to_tf_var_name(name:str): 68 | for patt, repl in iter(var_map): 69 | name = name.replace(patt, repl) 70 | return 'bert/{}'.format(name) 71 | 72 | def create_tf_var(tensor:np.ndarray, name:str, session:tf.Session): 73 | tf_dtype = tf.dtypes.as_dtype(tensor.dtype) 74 | tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) 75 | session.run(tf.variables_initializer([tf_var])) 76 | session.run(tf_var) 77 | return tf_var 78 | 79 | tf.reset_default_graph() 80 | with tf.Session() as session: 81 | for var_name in state_dict: 82 | tf_name = to_tf_var_name(var_name) 83 | 
torch_tensor = state_dict[var_name].numpy() 84 | if any([x in var_name for x in tensors_to_transpose]): 85 | torch_tensor = torch_tensor.T 86 | tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) 87 | tf.keras.backend.set_value(tf_var, torch_tensor) 88 | tf_weight = session.run(tf_var) 89 | print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor))) 90 | 91 | saver = tf.train.Saver(tf.trainable_variables()) 92 | saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) 93 | 94 | 95 | def main(raw_args=None): 96 | parser = argparse.ArgumentParser() 97 | parser.add_argument("--model_name", 98 | type=str, 99 | required=True, 100 | help="model name e.g. bert-base-uncased") 101 | parser.add_argument("--cache_dir", 102 | type=str, 103 | default=None, 104 | required=False, 105 | help="Directory containing pytorch model") 106 | parser.add_argument("--pytorch_model_path", 107 | type=str, 108 | required=True, 109 | help="/path/to/.bin") 110 | parser.add_argument("--tf_cache_dir", 111 | type=str, 112 | required=True, 113 | help="Directory in which to save tensorflow model") 114 | args = parser.parse_args(raw_args) 115 | 116 | model = BertModel.from_pretrained( 117 | pretrained_model_name_or_path=args.model_name, 118 | state_dict=torch.load(args.pytorch_model_path), 119 | cache_dir=args.cache_dir 120 | ) 121 | 122 | convert_pytorch_checkpoint_to_tf( 123 | model=model, 124 | ckpt_dir=args.tf_cache_dir, 125 | model_name=args.model_name 126 | ) 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /transformers/data/processors/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import csv 18 | import sys 19 | import copy 20 | import json 21 | 22 | class InputExample(object): 23 | """ 24 | A single training/test example for simple sequence classification. 25 | 26 | Args: 27 | guid: Unique id for the example. 28 | text_a: string. The untokenized text of the first sequence. For single 29 | sequence tasks, only this sequence must be specified. 30 | text_b: (Optional) string. The untokenized text of the second sequence. 31 | Only must be specified for sequence pair tasks. 32 | label: (Optional) string. The label of the example. This should be 33 | specified for train and dev examples, but not for test examples. 
34 | """ 35 | def __init__(self, guid, text_a, text_b=None, label=None): 36 | self.guid = guid 37 | self.text_a = text_a 38 | self.text_b = text_b 39 | self.label = label 40 | 41 | def __repr__(self): 42 | return str(self.to_json_string()) 43 | 44 | def to_dict(self): 45 | """Serializes this instance to a Python dictionary.""" 46 | output = copy.deepcopy(self.__dict__) 47 | return output 48 | 49 | def to_json_string(self): 50 | """Serializes this instance to a JSON string.""" 51 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 52 | 53 | 54 | class InputFeatures(object): 55 | """ 56 | A single set of features of data. 57 | 58 | Args: 59 | input_ids: Indices of input sequence tokens in the vocabulary. 60 | attention_mask: Mask to avoid performing attention on padding token indices. 61 | Mask values selected in ``[0, 1]``: 62 | Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens. 63 | token_type_ids: Segment token indices to indicate first and second portions of the inputs. 64 | label: Label corresponding to the input 65 | """ 66 | 67 | def __init__(self, input_ids, attention_mask, token_type_ids, label): 68 | self.input_ids = input_ids 69 | self.attention_mask = attention_mask 70 | self.token_type_ids = token_type_ids 71 | self.label = label 72 | 73 | def __repr__(self): 74 | return str(self.to_json_string()) 75 | 76 | def to_dict(self): 77 | """Serializes this instance to a Python dictionary.""" 78 | output = copy.deepcopy(self.__dict__) 79 | return output 80 | 81 | def to_json_string(self): 82 | """Serializes this instance to a JSON string.""" 83 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 84 | 85 | 86 | class DataProcessor(object): 87 | """Base class for data converters for sequence classification data sets.""" 88 | 89 | def get_example_from_tensor_dict(self, tensor_dict): 90 | """Gets an example from a dict with tensorflow tensors 91 | 92 | Args: 93 | tensor_dict: Keys and values should match the corresponding Glue 94 | tensorflow_dataset examples. 95 | """ 96 | raise NotImplementedError() 97 | 98 | def get_train_examples(self, data_dir): 99 | """Gets a collection of `InputExample`s for the train set.""" 100 | raise NotImplementedError() 101 | 102 | def get_dev_examples(self, data_dir): 103 | """Gets a collection of `InputExample`s for the dev set.""" 104 | raise NotImplementedError() 105 | 106 | def get_labels(self): 107 | """Gets the list of labels for this data set.""" 108 | raise NotImplementedError() 109 | 110 | def tfds_map(self, example): 111 | """Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. 
112 | This method converts examples to the correct format.""" 113 | if len(self.get_labels()) > 1: 114 | example.label = self.get_labels()[int(example.label)] 115 | return example 116 | 117 | @classmethod 118 | def _read_tsv(cls, input_file, quotechar=None): 119 | """Reads a tab separated value file.""" 120 | with open(input_file, "r", encoding="utf-8-sig") as f: 121 | reader = csv.reader(f, delimiter="\t", quotechar=quotechar) 122 | lines = [] 123 | for line in reader: 124 | if sys.version_info[0] == 2: 125 | line = list(unicode(cell, 'utf-8') for cell in line) 126 | lines.append(line) 127 | return lines 128 | -------------------------------------------------------------------------------- /transformers/configuration_albert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ ALBERT model configuration """ 17 | 18 | from .configuration_utils import PretrainedConfig 19 | 20 | ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 21 | 'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-config.json", 22 | 'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-config.json", 23 | 'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-config.json", 24 | 'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-config.json", 25 | 'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-config.json", 26 | 'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-config.json", 27 | 'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-config.json", 28 | 'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-config.json", 29 | } 30 | 31 | class AlbertConfig(PretrainedConfig): 32 | """Configuration for `AlbertModel`. 33 | 34 | The default settings match the configuration of model `albert_xxlarge`. 35 | """ 36 | 37 | pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 38 | 39 | def __init__(self, 40 | vocab_size_or_config_json_file=30000, 41 | embedding_size=128, 42 | hidden_size=4096, 43 | num_hidden_layers=12, 44 | num_hidden_groups=1, 45 | num_attention_heads=64, 46 | intermediate_size=16384, 47 | inner_group_num=1, 48 | hidden_act="gelu_new", 49 | hidden_dropout_prob=0, 50 | attention_probs_dropout_prob=0, 51 | max_position_embeddings=512, 52 | type_vocab_size=2, 53 | initializer_range=0.02, 54 | layer_norm_eps=1e-12, **kwargs): 55 | """Constructs AlbertConfig. 56 | 57 | Args: 58 | vocab_size: Vocabulary size of `inputs_ids` in `AlbertModel`. 59 | embedding_size: size of voc embeddings. 
60 | hidden_size: Size of the encoder layers and the pooler layer. 61 | num_hidden_layers: Number of hidden layers in the Transformer encoder. 62 | num_hidden_groups: Number of group for the hidden layers, parameters in 63 | the same group are shared. 64 | num_attention_heads: Number of attention heads for each attention layer in 65 | the Transformer encoder. 66 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 67 | layer in the Transformer encoder. 68 | inner_group_num: int, number of inner repetition of attention and ffn. 69 | down_scale_factor: float, the scale to apply 70 | hidden_act: The non-linear activation function (function or string) in the 71 | encoder and pooler. 72 | hidden_dropout_prob: The dropout probability for all fully connected 73 | layers in the embeddings, encoder, and pooler. 74 | attention_probs_dropout_prob: The dropout ratio for the attention 75 | probabilities. 76 | max_position_embeddings: The maximum sequence length that this model might 77 | ever be used with. Typically set this to something large just in case 78 | (e.g., 512 or 1024 or 2048). 79 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 80 | `AlbertModel`. 81 | initializer_range: The stdev of the truncated_normal_initializer for 82 | initializing all weight matrices. 83 | """ 84 | super(AlbertConfig, self).__init__(**kwargs) 85 | 86 | self.vocab_size = vocab_size_or_config_json_file 87 | self.embedding_size = embedding_size 88 | self.hidden_size = hidden_size 89 | self.num_hidden_layers = num_hidden_layers 90 | self.num_hidden_groups = num_hidden_groups 91 | self.num_attention_heads = num_attention_heads 92 | self.inner_group_num = inner_group_num 93 | self.hidden_act = hidden_act 94 | self.intermediate_size = intermediate_size 95 | self.hidden_dropout_prob = hidden_dropout_prob 96 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 97 | self.max_position_embeddings = max_position_embeddings 98 | self.type_vocab_size = type_vocab_size 99 | self.initializer_range = initializer_range 100 | self.layer_norm_eps = layer_norm_eps -------------------------------------------------------------------------------- /transformers/configuration_openai.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ OpenAI GPT configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 30 | "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json" 31 | } 32 | 33 | class OpenAIGPTConfig(PretrainedConfig): 34 | """ 35 | Configuration class to store the configuration of a `OpenAIGPTModel`. 36 | 37 | Args: 38 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file. 39 | n_positions: Number of positional embeddings. 40 | n_ctx: Size of the causal mask (usually same as n_positions). 41 | n_embd: Dimensionality of the embeddings and hidden states. 42 | n_layer: Number of hidden layers in the Transformer encoder. 43 | n_head: Number of attention heads for each attention layer in 44 | the Transformer encoder. 45 | afn: The non-linear activation function (function or string) in the 46 | encoder and pooler. If string, "gelu", "relu" and "swish" are supported. 47 | resid_pdrop: The dropout probabilitiy for all fully connected 48 | layers in the embeddings, encoder, and pooler. 49 | attn_pdrop: The dropout ratio for the attention 50 | probabilities. 51 | embd_pdrop: The dropout ratio for the embeddings. 52 | layer_norm_epsilon: epsilon to use in the layer norm layers 53 | initializer_range: The sttdev of the truncated_normal_initializer for 54 | initializing all weight matrices. 55 | predict_special_tokens: should we predict special tokens (when the model has a LM head) 56 | """ 57 | pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP 58 | 59 | def __init__( 60 | self, 61 | vocab_size_or_config_json_file=40478, 62 | n_positions=512, 63 | n_ctx=512, 64 | n_embd=768, 65 | n_layer=12, 66 | n_head=12, 67 | afn="gelu", 68 | resid_pdrop=0.1, 69 | embd_pdrop=0.1, 70 | attn_pdrop=0.1, 71 | layer_norm_epsilon=1e-5, 72 | initializer_range=0.02, 73 | predict_special_tokens=True, 74 | 75 | num_labels=1, 76 | summary_type='cls_index', 77 | summary_use_proj=True, 78 | summary_activation=None, 79 | summary_proj_to_labels=True, 80 | summary_first_dropout=0.1, 81 | **kwargs 82 | ): 83 | """Constructs OpenAIGPTConfig. 
84 | """ 85 | super(OpenAIGPTConfig, self).__init__(**kwargs) 86 | 87 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 88 | and isinstance(vocab_size_or_config_json_file, unicode)): 89 | with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: 90 | json_config = json.loads(reader.read()) 91 | for key, value in json_config.items(): 92 | self.__dict__[key] = value 93 | elif isinstance(vocab_size_or_config_json_file, int): 94 | self.vocab_size = vocab_size_or_config_json_file 95 | self.n_ctx = n_ctx 96 | self.n_positions = n_positions 97 | self.n_embd = n_embd 98 | self.n_layer = n_layer 99 | self.n_head = n_head 100 | self.afn = afn 101 | self.resid_pdrop = resid_pdrop 102 | self.embd_pdrop = embd_pdrop 103 | self.attn_pdrop = attn_pdrop 104 | self.layer_norm_epsilon = layer_norm_epsilon 105 | self.initializer_range = initializer_range 106 | self.predict_special_tokens = predict_special_tokens 107 | 108 | self.num_labels = num_labels 109 | self.summary_type = summary_type 110 | self.summary_use_proj = summary_use_proj 111 | self.summary_activation = summary_activation 112 | self.summary_first_dropout = summary_first_dropout 113 | self.summary_proj_to_labels = summary_proj_to_labels 114 | else: 115 | raise ValueError( 116 | "First argument must be either a vocabulary size (int)" 117 | "or the path to a pretrained model config file (str)" 118 | ) 119 | 120 | @property 121 | def max_position_embeddings(self): 122 | return self.n_positions 123 | 124 | @property 125 | def hidden_size(self): 126 | return self.n_embd 127 | 128 | @property 129 | def num_attention_heads(self): 130 | return self.n_head 131 | 132 | @property 133 | def num_hidden_layers(self): 134 | return self.n_layer 135 | -------------------------------------------------------------------------------- /transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert Transformer XL checkpoint and datasets.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import os 21 | import sys 22 | from io import open 23 | 24 | import torch 25 | 26 | import transformers.tokenization_transfo_xl as data_utils 27 | 28 | from transformers import CONFIG_NAME, WEIGHTS_NAME 29 | from transformers import (TransfoXLConfig, TransfoXLLMHeadModel, 30 | load_tf_weights_in_transfo_xl) 31 | from transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES) 32 | 33 | if sys.version_info[0] == 2: 34 | import cPickle as pickle 35 | else: 36 | import pickle 37 | 38 | import logging 39 | logging.basicConfig(level=logging.INFO) 40 | 41 | # We do this to be able to load python 2 datasets pickles 42 | # See e.g. 
https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918 43 | data_utils.Vocab = data_utils.TransfoXLTokenizer 44 | data_utils.Corpus = data_utils.TransfoXLCorpus 45 | sys.modules['data_utils'] = data_utils 46 | sys.modules['vocabulary'] = data_utils 47 | 48 | def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path, 49 | transfo_xl_config_file, 50 | pytorch_dump_folder_path, 51 | transfo_xl_dataset_file): 52 | if transfo_xl_dataset_file: 53 | # Convert a pre-processed corpus (see original TensorFlow repo) 54 | with open(transfo_xl_dataset_file, "rb") as fp: 55 | corpus = pickle.load(fp, encoding="latin1") 56 | # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term) 57 | pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['pretrained_vocab_file'] 58 | print("Save vocabulary to {}".format(pytorch_vocab_dump_path)) 59 | corpus_vocab_dict = corpus.vocab.__dict__ 60 | torch.save(corpus_vocab_dict, pytorch_vocab_dump_path) 61 | 62 | corpus_dict_no_vocab = corpus.__dict__ 63 | corpus_dict_no_vocab.pop('vocab', None) 64 | pytorch_dataset_dump_path = pytorch_dump_folder_path + '/' + CORPUS_NAME 65 | print("Save dataset to {}".format(pytorch_dataset_dump_path)) 66 | torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path) 67 | 68 | if tf_checkpoint_path: 69 | # Convert a pre-trained TensorFlow model 70 | config_path = os.path.abspath(transfo_xl_config_file) 71 | tf_path = os.path.abspath(tf_checkpoint_path) 72 | 73 | print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path)) 74 | # Initialise PyTorch model 75 | if transfo_xl_config_file == "": 76 | config = TransfoXLConfig() 77 | else: 78 | config = TransfoXLConfig.from_json_file(transfo_xl_config_file) 79 | print("Building PyTorch model from configuration: {}".format(str(config))) 80 | model = TransfoXLLMHeadModel(config) 81 | 82 | model = load_tf_weights_in_transfo_xl(model, config, tf_path) 83 | # Save pytorch-model 84 | pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) 85 | pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) 86 | print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) 87 | torch.save(model.state_dict(), pytorch_weights_dump_path) 88 | print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) 89 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 90 | f.write(config.to_json_string()) 91 | 92 | 93 | if __name__ == "__main__": 94 | parser = argparse.ArgumentParser() 95 | parser.add_argument("--pytorch_dump_folder_path", 96 | default = None, 97 | type = str, 98 | required = True, 99 | help = "Path to the folder to store the PyTorch model or dataset/vocab.") 100 | parser.add_argument("--tf_checkpoint_path", 101 | default = "", 102 | type = str, 103 | help = "An optional path to a TensorFlow checkpoint path to be converted.") 104 | parser.add_argument("--transfo_xl_config_file", 105 | default = "", 106 | type = str, 107 | help = "An optional config json file corresponding to the pre-trained BERT model. 
\n" 108 | "This specifies the model architecture.") 109 | parser.add_argument("--transfo_xl_dataset_file", 110 | default = "", 111 | type = str, 112 | help = "An optional dataset file to be converted in a vocabulary.") 113 | args = parser.parse_args() 114 | convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path, 115 | args.transfo_xl_config_file, 116 | args.pytorch_dump_folder_path, 117 | args.transfo_xl_dataset_file) 118 | -------------------------------------------------------------------------------- /transformers/commands/user.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from getpass import getpass 3 | import os 4 | 5 | from transformers.commands import BaseTransformersCLICommand 6 | from transformers.hf_api import HfApi, HfFolder, HTTPError 7 | 8 | 9 | class UserCommands(BaseTransformersCLICommand): 10 | @staticmethod 11 | def register_subcommand(parser: ArgumentParser): 12 | login_parser = parser.add_parser('login') 13 | login_parser.set_defaults(func=lambda args: LoginCommand(args)) 14 | whoami_parser = parser.add_parser('whoami') 15 | whoami_parser.set_defaults(func=lambda args: WhoamiCommand(args)) 16 | logout_parser = parser.add_parser('logout') 17 | logout_parser.set_defaults(func=lambda args: LogoutCommand(args)) 18 | list_parser = parser.add_parser('ls') 19 | list_parser.set_defaults(func=lambda args: ListObjsCommand(args)) 20 | # upload 21 | upload_parser = parser.add_parser('upload') 22 | upload_parser.add_argument('file', type=str, help='Local filepath of the file to upload.') 23 | upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override object filename on S3.') 24 | upload_parser.set_defaults(func=lambda args: UploadCommand(args)) 25 | 26 | 27 | 28 | class ANSI: 29 | """ 30 | Helper for en.wikipedia.org/wiki/ANSI_escape_code 31 | """ 32 | _bold = u"\u001b[1m" 33 | _reset = u"\u001b[0m" 34 | @classmethod 35 | def bold(cls, s): 36 | return "{}{}{}".format(cls._bold, s, cls._reset) 37 | 38 | 39 | class BaseUserCommand: 40 | def __init__(self, args): 41 | self.args = args 42 | self._api = HfApi() 43 | 44 | 45 | class LoginCommand(BaseUserCommand): 46 | def run(self): 47 | print(""" 48 | _| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_| 49 | _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _| 50 | _|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_| 51 | _| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _| 52 | _| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_| 53 | 54 | """) 55 | username = input("Username: ") 56 | password = getpass() 57 | try: 58 | token = self._api.login(username, password) 59 | except HTTPError as e: 60 | # probably invalid credentials, display error message. 
61 | print(e) 62 | exit(1) 63 | HfFolder.save_token(token) 64 | print("Login successful") 65 | print("Your token:", token, "\n") 66 | print("Your token has been saved to", HfFolder.path_token) 67 | 68 | 69 | class WhoamiCommand(BaseUserCommand): 70 | def run(self): 71 | token = HfFolder.get_token() 72 | if token is None: 73 | print("Not logged in") 74 | exit() 75 | try: 76 | user = self._api.whoami(token) 77 | print(user) 78 | except HTTPError as e: 79 | print(e) 80 | 81 | 82 | class LogoutCommand(BaseUserCommand): 83 | def run(self): 84 | token = HfFolder.get_token() 85 | if token is None: 86 | print("Not logged in") 87 | exit() 88 | HfFolder.delete_token() 89 | self._api.logout(token) 90 | print("Successfully logged out.") 91 | 92 | 93 | class ListObjsCommand(BaseUserCommand): 94 | def tabulate(self, rows, headers): 95 | # type: (List[List[Union[str, int]]], List[str]) -> str 96 | """ 97 | Inspired by: 98 | stackoverflow.com/a/8356620/593036 99 | stackoverflow.com/questions/9535954/printing-lists-as-tabular-data 100 | """ 101 | col_widths = [max(len(str(x)) for x in col) for col in zip(*rows, headers)] 102 | row_format = ("{{:{}}} " * len(headers)).format(*col_widths) 103 | lines = [] 104 | lines.append( 105 | row_format.format(*headers) 106 | ) 107 | lines.append( 108 | row_format.format(*["-" * w for w in col_widths]) 109 | ) 110 | for row in rows: 111 | lines.append( 112 | row_format.format(*row) 113 | ) 114 | return "\n".join(lines) 115 | 116 | def run(self): 117 | token = HfFolder.get_token() 118 | if token is None: 119 | print("Not logged in") 120 | exit(1) 121 | try: 122 | objs = self._api.list_objs(token) 123 | except HTTPError as e: 124 | print(e) 125 | exit(1) 126 | if len(objs) == 0: 127 | print("No shared file yet") 128 | exit() 129 | rows = [ [ 130 | obj.filename, 131 | obj.LastModified, 132 | obj.ETag, 133 | obj.Size 134 | ] for obj in objs ] 135 | print( 136 | self.tabulate(rows, headers=["Filename", "LastModified", "ETag", "Size"]) 137 | ) 138 | 139 | 140 | class UploadCommand(BaseUserCommand): 141 | def run(self): 142 | token = HfFolder.get_token() 143 | if token is None: 144 | print("Not logged in") 145 | exit(1) 146 | filepath = os.path.join(os.getcwd(), self.args.file) 147 | filename = self.args.filename if self.args.filename is not None else os.path.basename(filepath) 148 | print( 149 | "About to upload file {} to S3 under filename {}".format( 150 | ANSI.bold(filepath), ANSI.bold(filename) 151 | ) 152 | ) 153 | 154 | choice = input("Proceed? [Y/n] ").lower() 155 | if not(choice == "" or choice == "y" or choice == "yes"): 156 | print("Abort") 157 | exit() 158 | print( 159 | ANSI.bold("Uploading... This might take a while if file is large") 160 | ) 161 | access_url = self._api.presign_and_upload( 162 | token=token, filename=filename, filepath=filepath 163 | ) 164 | print("Your file now lives at:") 165 | print(access_url) 166 | -------------------------------------------------------------------------------- /transformers/configuration_ctrl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Salesforce and HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Salesforce CTRL configuration """ 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import json 20 | import logging 21 | import sys 22 | from io import open 23 | 24 | from .configuration_utils import PretrainedConfig 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf-ctrl/pytorch/ctrl-config.json"} 29 | 30 | class CTRLConfig(PretrainedConfig): 31 | """Configuration class to store the configuration of a `CTRLModel`. 32 | 33 | Args: 34 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file. 35 | n_positions: Number of positional embeddings. 36 | n_ctx: Size of the causal mask (usually same as n_positions). 37 | dff: Size of the inner dimension of the FFN. 38 | n_embd: Dimensionality of the embeddings and hidden states. 39 | n_layer: Number of hidden layers in the Transformer encoder. 40 | n_head: Number of attention heads for each attention layer in 41 | the Transformer encoder. 42 | layer_norm_epsilon: epsilon to use in the layer norm layers 43 | resid_pdrop: The dropout probabilitiy for all fully connected 44 | layers in the embeddings, encoder, and pooler. 45 | attn_pdrop: The dropout ratio for the attention 46 | probabilities. 47 | embd_pdrop: The dropout ratio for the embeddings. 48 | initializer_range: The sttdev of the truncated_normal_initializer for 49 | initializing all weight matrices. 50 | """ 51 | pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP 52 | 53 | def __init__( 54 | self, 55 | vocab_size_or_config_json_file=246534, 56 | n_positions=256, 57 | n_ctx=256, 58 | n_embd=1280, 59 | dff=8192, 60 | n_layer=48, 61 | n_head=16, 62 | resid_pdrop=0.1, 63 | embd_pdrop=0.1, 64 | attn_pdrop=0.1, 65 | layer_norm_epsilon=1e-6, 66 | initializer_range=0.02, 67 | 68 | num_labels=1, 69 | summary_type='cls_index', 70 | summary_use_proj=True, 71 | summary_activation=None, 72 | summary_proj_to_labels=True, 73 | summary_first_dropout=0.1, 74 | **kwargs 75 | ): 76 | """Constructs CTRLConfig. 77 | 78 | Args: 79 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file. 80 | n_positions: Number of positional embeddings. 81 | n_ctx: Size of the causal mask (usually same as n_positions). 82 | dff: Size of the inner dimension of the FFN. 83 | n_embd: Dimensionality of the embeddings and hidden states. 84 | n_layer: Number of hidden layers in the Transformer encoder. 85 | n_head: Number of attention heads for each attention layer in 86 | the Transformer encoder. 87 | layer_norm_epsilon: epsilon to use in the layer norm layers 88 | resid_pdrop: The dropout probabilitiy for all fully connected 89 | layers in the embeddings, encoder, and pooler. 90 | attn_pdrop: The dropout ratio for the attention 91 | probabilities. 92 | embd_pdrop: The dropout ratio for the embeddings. 93 | initializer_range: The sttdev of the truncated_normal_initializer for 94 | initializing all weight matrices. 
95 | """ 96 | super(CTRLConfig, self).__init__(**kwargs) 97 | 98 | self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1 99 | self.n_ctx = n_ctx 100 | self.n_positions = n_positions 101 | self.n_embd = n_embd 102 | self.n_layer = n_layer 103 | self.n_head = n_head 104 | self.dff = dff 105 | self.resid_pdrop = resid_pdrop 106 | self.embd_pdrop = embd_pdrop 107 | self.attn_pdrop = attn_pdrop 108 | self.layer_norm_epsilon = layer_norm_epsilon 109 | self.initializer_range = initializer_range 110 | 111 | self.num_labels = num_labels 112 | self.summary_type = summary_type 113 | self.summary_use_proj = summary_use_proj 114 | self.summary_activation = summary_activation 115 | self.summary_first_dropout = summary_first_dropout 116 | self.summary_proj_to_labels = summary_proj_to_labels 117 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 118 | and isinstance(vocab_size_or_config_json_file, unicode)): 119 | with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: 120 | json_config = json.loads(reader.read()) 121 | for key, value in json_config.items(): 122 | self.__dict__[key] = value 123 | elif not isinstance(vocab_size_or_config_json_file, int): 124 | raise ValueError( 125 | "First argument must be either a vocabulary size (int)" 126 | "or the path to a pretrained model config file (str)" 127 | ) 128 | 129 | @property 130 | def max_position_embeddings(self): 131 | return self.n_positions 132 | 133 | @property 134 | def hidden_size(self): 135 | return self.n_embd 136 | 137 | @property 138 | def num_attention_heads(self): 139 | return self.n_head 140 | 141 | @property 142 | def num_hidden_layers(self): 143 | return self.n_layer 144 | -------------------------------------------------------------------------------- /transformers/configuration_gpt2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ OpenAI GPT-2 configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", 30 | "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json", 31 | "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json", 32 | "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-config.json", 33 | "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",} 34 | 35 | class GPT2Config(PretrainedConfig): 36 | """Configuration class to store the configuration of a `GPT2Model`. 37 | 38 | Args: 39 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. 40 | n_positions: Number of positional embeddings. 41 | n_ctx: Size of the causal mask (usually same as n_positions). 42 | n_embd: Dimensionality of the embeddings and hidden states. 43 | n_layer: Number of hidden layers in the Transformer encoder. 44 | n_head: Number of attention heads for each attention layer in 45 | the Transformer encoder. 46 | layer_norm_epsilon: epsilon to use in the layer norm layers 47 | resid_pdrop: The dropout probabilitiy for all fully connected 48 | layers in the embeddings, encoder, and pooler. 49 | attn_pdrop: The dropout ratio for the attention 50 | probabilities. 51 | embd_pdrop: The dropout ratio for the embeddings. 52 | initializer_range: The sttdev of the truncated_normal_initializer for 53 | initializing all weight matrices. 54 | """ 55 | pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP 56 | 57 | def __init__( 58 | self, 59 | vocab_size_or_config_json_file=50257, 60 | n_positions=1024, 61 | n_ctx=1024, 62 | n_embd=768, 63 | n_layer=12, 64 | n_head=12, 65 | resid_pdrop=0.1, 66 | embd_pdrop=0.1, 67 | attn_pdrop=0.1, 68 | layer_norm_epsilon=1e-5, 69 | initializer_range=0.02, 70 | 71 | num_labels=1, 72 | summary_type='cls_index', 73 | summary_use_proj=True, 74 | summary_activation=None, 75 | summary_proj_to_labels=True, 76 | summary_first_dropout=0.1, 77 | **kwargs 78 | ): 79 | """Constructs GPT2Config. 80 | 81 | Args: 82 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. 83 | n_positions: Number of positional embeddings. 84 | n_ctx: Size of the causal mask (usually same as n_positions). 85 | n_embd: Dimensionality of the embeddings and hidden states. 86 | n_layer: Number of hidden layers in the Transformer encoder. 87 | n_head: Number of attention heads for each attention layer in 88 | the Transformer encoder. 89 | layer_norm_epsilon: epsilon to use in the layer norm layers 90 | resid_pdrop: The dropout probabilitiy for all fully connected 91 | layers in the embeddings, encoder, and pooler. 92 | attn_pdrop: The dropout ratio for the attention 93 | probabilities. 94 | embd_pdrop: The dropout ratio for the embeddings. 95 | initializer_range: The sttdev of the truncated_normal_initializer for 96 | initializing all weight matrices. 
97 | """ 98 | super(GPT2Config, self).__init__(**kwargs) 99 | 100 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 101 | and isinstance(vocab_size_or_config_json_file, unicode)): 102 | with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: 103 | json_config = json.loads(reader.read()) 104 | for key, value in json_config.items(): 105 | self.__dict__[key] = value 106 | elif isinstance(vocab_size_or_config_json_file, int): 107 | self.vocab_size = vocab_size_or_config_json_file 108 | self.n_ctx = n_ctx 109 | self.n_positions = n_positions 110 | self.n_embd = n_embd 111 | self.n_layer = n_layer 112 | self.n_head = n_head 113 | self.resid_pdrop = resid_pdrop 114 | self.embd_pdrop = embd_pdrop 115 | self.attn_pdrop = attn_pdrop 116 | self.layer_norm_epsilon = layer_norm_epsilon 117 | self.initializer_range = initializer_range 118 | 119 | self.num_labels = num_labels 120 | self.summary_type = summary_type 121 | self.summary_use_proj = summary_use_proj 122 | self.summary_activation = summary_activation 123 | self.summary_first_dropout = summary_first_dropout 124 | self.summary_proj_to_labels = summary_proj_to_labels 125 | else: 126 | raise ValueError( 127 | "First argument must be either a vocabulary size (int)" 128 | "or the path to a pretrained model config file (str)" 129 | ) 130 | 131 | @property 132 | def max_position_embeddings(self): 133 | return self.n_positions 134 | 135 | @property 136 | def hidden_size(self): 137 | return self.n_embd 138 | 139 | @property 140 | def num_attention_heads(self): 141 | return self.n_head 142 | 143 | @property 144 | def num_hidden_layers(self): 145 | return self.n_layer 146 | -------------------------------------------------------------------------------- /transformers/configuration_xlnet.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ XLNet configuration """ 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import json 20 | import logging 21 | import sys 22 | from io import open 23 | 24 | from .configuration_utils import PretrainedConfig 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { 29 | 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json", 30 | 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json", 31 | } 32 | 33 | 34 | class XLNetConfig(PretrainedConfig): 35 | """Configuration class to store the configuration of a ``XLNetModel``. 36 | 37 | Args: 38 | vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``. 
39 | d_model: Size of the encoder layers and the pooler layer. 40 | n_layer: Number of hidden layers in the Transformer encoder. 41 | n_head: Number of attention heads for each attention layer in 42 | the Transformer encoder. 43 | d_inner: The size of the "intermediate" (i.e., feed-forward) 44 | layer in the Transformer encoder. 45 | ff_activation: The non-linear activation function (function or string) in the 46 | encoder and pooler. If string, "gelu", "relu" and "swish" are supported. 47 | untie_r: untie relative position biases 48 | attn_type: 'bi' for XLNet, 'uni' for Transformer-XL 49 | 50 | dropout: The dropout probabilitiy for all fully connected 51 | layers in the embeddings, encoder, and pooler. 52 | initializer_range: The sttdev of the truncated_normal_initializer for 53 | initializing all weight matrices. 54 | layer_norm_eps: The epsilon used by LayerNorm. 55 | 56 | dropout: float, dropout rate. 57 | init: str, the initialization scheme, either "normal" or "uniform". 58 | init_range: float, initialize the parameters with a uniform distribution 59 | in [-init_range, init_range]. Only effective when init="uniform". 60 | init_std: float, initialize the parameters with a normal distribution 61 | with mean 0 and stddev init_std. Only effective when init="normal". 62 | mem_len: int, the number of tokens to cache. 63 | reuse_len: int, the number of tokens in the currect batch to be cached 64 | and reused in the future. 65 | bi_data: bool, whether to use bidirectional input pipeline. 66 | Usually set to True during pretraining and False during finetuning. 67 | clamp_len: int, clamp all relative distances larger than clamp_len. 68 | -1 means no clamping. 69 | same_length: bool, whether to use the same attention length for each token. 70 | finetuning_task: name of the glue task on which the model was fine-tuned if any 71 | """ 72 | pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP 73 | 74 | def __init__(self, 75 | vocab_size_or_config_json_file=32000, 76 | d_model=1024, 77 | n_layer=24, 78 | n_head=16, 79 | d_inner=4096, 80 | max_position_embeddings=512, 81 | ff_activation="gelu", 82 | untie_r=True, 83 | attn_type="bi", 84 | 85 | initializer_range=0.02, 86 | layer_norm_eps=1e-12, 87 | 88 | dropout=0.1, 89 | mem_len=None, 90 | reuse_len=None, 91 | bi_data=False, 92 | clamp_len=-1, 93 | same_length=False, 94 | 95 | finetuning_task=None, 96 | num_labels=2, 97 | summary_type='last', 98 | summary_use_proj=True, 99 | summary_activation='tanh', 100 | summary_last_dropout=0.1, 101 | start_n_top=5, 102 | end_n_top=5, 103 | **kwargs): 104 | """Constructs XLNetConfig. 
105 | """ 106 | super(XLNetConfig, self).__init__(**kwargs) 107 | 108 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 109 | and isinstance(vocab_size_or_config_json_file, unicode)): 110 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 111 | json_config = json.loads(reader.read()) 112 | for key, value in json_config.items(): 113 | setattr(config, key, value) 114 | elif isinstance(vocab_size_or_config_json_file, int): 115 | self.n_token = vocab_size_or_config_json_file 116 | self.d_model = d_model 117 | self.n_layer = n_layer 118 | self.n_head = n_head 119 | assert d_model % n_head == 0 120 | self.d_head = d_model // n_head 121 | self.ff_activation = ff_activation 122 | self.d_inner = d_inner 123 | self.untie_r = untie_r 124 | self.attn_type = attn_type 125 | 126 | self.initializer_range = initializer_range 127 | self.layer_norm_eps = layer_norm_eps 128 | 129 | self.dropout = dropout 130 | self.mem_len = mem_len 131 | self.reuse_len = reuse_len 132 | self.bi_data = bi_data 133 | self.clamp_len = clamp_len 134 | self.same_length = same_length 135 | 136 | self.finetuning_task = finetuning_task 137 | self.num_labels = num_labels 138 | self.summary_type = summary_type 139 | self.summary_use_proj = summary_use_proj 140 | self.summary_activation = summary_activation 141 | self.summary_last_dropout = summary_last_dropout 142 | self.start_n_top = start_n_top 143 | self.end_n_top = end_n_top 144 | else: 145 | raise ValueError("First argument must be either a vocabulary size (int)" 146 | " or the path to a pretrained model config file (str)") 147 | 148 | @property 149 | def max_position_embeddings(self): 150 | return -1 151 | 152 | @property 153 | def vocab_size(self): 154 | return self.n_token 155 | 156 | @vocab_size.setter 157 | def vocab_size(self, value): 158 | self.n_token = value 159 | 160 | @property 161 | def hidden_size(self): 162 | return self.d_model 163 | 164 | @property 165 | def num_attention_heads(self): 166 | return self.n_head 167 | 168 | @property 169 | def num_hidden_layers(self): 170 | return self.n_layer 171 | -------------------------------------------------------------------------------- /transformers/configuration_transfo_xl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ Transformer XL configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = { 30 | 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json", 31 | } 32 | 33 | class TransfoXLConfig(PretrainedConfig): 34 | """Configuration class to store the configuration of a `TransfoXLModel`. 35 | 36 | Args: 37 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file. 38 | cutoffs: cutoffs for the adaptive softmax 39 | d_model: Dimensionality of the model's hidden states. 40 | d_embed: Dimensionality of the embeddings 41 | d_head: Dimensionality of the model's heads. 42 | div_val: divident value for adapative input and softmax 43 | pre_lnorm: apply LayerNorm to the input instead of the output 44 | d_inner: Inner dimension in FF 45 | n_layer: Number of hidden layers in the Transformer encoder. 46 | n_head: Number of attention heads for each attention layer in 47 | the Transformer encoder. 48 | tgt_len: number of tokens to predict 49 | ext_len: length of the extended context 50 | mem_len: length of the retained previous heads 51 | same_length: use the same attn length for all tokens 52 | proj_share_all_but_first: True to share all but first projs, False not to share. 53 | attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al. 54 | clamp_len: use the same pos embeddings after clamp_len 55 | sample_softmax: number of samples in sampled softmax 56 | adaptive: use adaptive softmax 57 | tie_weight: tie the word embedding and softmax weights 58 | dropout: The dropout probabilitiy for all fully connected 59 | layers in the embeddings, encoder, and pooler. 60 | dropatt: The dropout ratio for the attention probabilities. 61 | untie_r: untie relative position biases 62 | embd_pdrop: The dropout ratio for the embeddings. 63 | init: parameter initializer to use 64 | init_range: parameters initialized by U(-init_range, init_range). 65 | proj_init_std: parameters initialized by N(0, init_std) 66 | init_std: parameters initialized by N(0, init_std) 67 | """ 68 | pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP 69 | 70 | def __init__(self, 71 | vocab_size_or_config_json_file=267735, 72 | cutoffs=[20000, 40000, 200000], 73 | d_model=1024, 74 | d_embed=1024, 75 | n_head=16, 76 | d_head=64, 77 | d_inner=4096, 78 | div_val=4, 79 | pre_lnorm=False, 80 | n_layer=18, 81 | tgt_len=128, 82 | ext_len=0, 83 | mem_len=1600, 84 | clamp_len=1000, 85 | same_length=True, 86 | proj_share_all_but_first=True, 87 | attn_type=0, 88 | sample_softmax=-1, 89 | adaptive=True, 90 | tie_weight=True, 91 | dropout=0.1, 92 | dropatt=0.0, 93 | untie_r=True, 94 | init="normal", 95 | init_range=0.01, 96 | proj_init_std=0.01, 97 | init_std=0.02, 98 | layer_norm_epsilon=1e-5, 99 | **kwargs): 100 | """Constructs TransfoXLConfig. 
101 | """ 102 | super(TransfoXLConfig, self).__init__(**kwargs) 103 | self.n_token = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1 104 | self.cutoffs = [] 105 | self.cutoffs.extend(cutoffs) 106 | self.tie_weight = tie_weight 107 | if proj_share_all_but_first: 108 | self.tie_projs = [False] + [True] * len(self.cutoffs) 109 | else: 110 | self.tie_projs = [False] + [False] * len(self.cutoffs) 111 | self.d_model = d_model 112 | self.d_embed = d_embed 113 | self.d_head = d_head 114 | self.d_inner = d_inner 115 | self.div_val = div_val 116 | self.pre_lnorm = pre_lnorm 117 | self.n_layer = n_layer 118 | self.n_head = n_head 119 | self.tgt_len = tgt_len 120 | self.ext_len = ext_len 121 | self.mem_len = mem_len 122 | self.same_length = same_length 123 | self.attn_type = attn_type 124 | self.clamp_len = clamp_len 125 | self.sample_softmax = sample_softmax 126 | self.adaptive = adaptive 127 | self.dropout = dropout 128 | self.dropatt = dropatt 129 | self.untie_r = untie_r 130 | self.init = init 131 | self.init_range = init_range 132 | self.proj_init_std = proj_init_std 133 | self.init_std = init_std 134 | self.layer_norm_epsilon = layer_norm_epsilon 135 | 136 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 137 | and isinstance(vocab_size_or_config_json_file, unicode)): 138 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 139 | json_config = json.loads(reader.read()) 140 | for key, value in json_config.items(): 141 | self.__dict__[key] = value 142 | elif not isinstance(vocab_size_or_config_json_file, int): 143 | raise ValueError("First argument must be either a vocabulary size (int)" 144 | " or the path to a pretrained model config file (str)") 145 | 146 | @property 147 | def max_position_embeddings(self): 148 | return self.tgt_len + self.ext_len + self.mem_len 149 | 150 | @property 151 | def vocab_size(self): 152 | return self.n_token 153 | 154 | @vocab_size.setter 155 | def vocab_size(self, value): 156 | self.n_token = value 157 | 158 | @property 159 | def hidden_size(self): 160 | return self.d_model 161 | 162 | @property 163 | def num_attention_heads(self): 164 | return self.n_head 165 | 166 | @property 167 | def num_hidden_layers(self): 168 | return self.n_layer 169 | -------------------------------------------------------------------------------- /transformers/__main__.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | def main(): 3 | import sys 4 | if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]: 5 | print( 6 | "This command line utility let you convert original (author released) model checkpoint to pytorch.\n" 7 | "It should be used as one of: \n" 8 | ">> transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n" 9 | ">> transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n" 10 | ">> transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n" 11 | ">> transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n" 12 | ">> transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n" 13 | ">> transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT") 14 | else: 15 | if sys.argv[1] == "bert": 16 | try: 17 | from .convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch 18 | except ImportError: 19 | print("transformers 
can only be used from the commandline to convert TensorFlow models in PyTorch, " 20 | "In that case, it requires TensorFlow to be installed. Please see " 21 | "https://www.tensorflow.org/install/ for installation instructions.") 22 | raise 23 | 24 | if len(sys.argv) != 5: 25 | # pylint: disable=line-too-long 26 | print("Should be used as `transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") 27 | else: 28 | PYTORCH_DUMP_OUTPUT = sys.argv.pop() 29 | TF_CONFIG = sys.argv.pop() 30 | TF_CHECKPOINT = sys.argv.pop() 31 | convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) 32 | elif sys.argv[1] == "gpt": 33 | from .convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch 34 | if len(sys.argv) < 4 or len(sys.argv) > 5: 35 | # pylint: disable=line-too-long 36 | print("Should be used as `transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`") 37 | else: 38 | OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2] 39 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 40 | if len(sys.argv) == 5: 41 | OPENAI_GPT_CONFIG = sys.argv[4] 42 | else: 43 | OPENAI_GPT_CONFIG = "" 44 | convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH, 45 | OPENAI_GPT_CONFIG, 46 | PYTORCH_DUMP_OUTPUT) 47 | elif sys.argv[1] == "transfo_xl": 48 | try: 49 | from .convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch 50 | except ImportError: 51 | print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 52 | "In that case, it requires TensorFlow to be installed. Please see " 53 | "https://www.tensorflow.org/install/ for installation instructions.") 54 | raise 55 | if len(sys.argv) < 4 or len(sys.argv) > 5: 56 | # pylint: disable=line-too-long 57 | print("Should be used as `transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") 58 | else: 59 | if 'ckpt' in sys.argv[2].lower(): 60 | TF_CHECKPOINT = sys.argv[2] 61 | TF_DATASET_FILE = "" 62 | else: 63 | TF_DATASET_FILE = sys.argv[2] 64 | TF_CHECKPOINT = "" 65 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 66 | if len(sys.argv) == 5: 67 | TF_CONFIG = sys.argv[4] 68 | else: 69 | TF_CONFIG = "" 70 | convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE) 71 | elif sys.argv[1] == "gpt2": 72 | try: 73 | from .convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch 74 | except ImportError: 75 | print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 76 | "In that case, it requires TensorFlow to be installed. 
Please see " 77 | "https://www.tensorflow.org/install/ for installation instructions.") 78 | raise 79 | 80 | if len(sys.argv) < 4 or len(sys.argv) > 5: 81 | # pylint: disable=line-too-long 82 | print("Should be used as `transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") 83 | else: 84 | TF_CHECKPOINT = sys.argv[2] 85 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 86 | if len(sys.argv) == 5: 87 | TF_CONFIG = sys.argv[4] 88 | else: 89 | TF_CONFIG = "" 90 | convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) 91 | elif sys.argv[1] == "xlnet": 92 | try: 93 | from .convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch 94 | except ImportError: 95 | print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 96 | "In that case, it requires TensorFlow to be installed. Please see " 97 | "https://www.tensorflow.org/install/ for installation instructions.") 98 | raise 99 | 100 | if len(sys.argv) < 5 or len(sys.argv) > 6: 101 | # pylint: disable=line-too-long 102 | print("Should be used as `transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`") 103 | else: 104 | TF_CHECKPOINT = sys.argv[2] 105 | TF_CONFIG = sys.argv[3] 106 | PYTORCH_DUMP_OUTPUT = sys.argv[4] 107 | if len(sys.argv) == 6: 108 | FINETUNING_TASK = sys.argv[5] 109 | else: 110 | FINETUNING_TASK = None 111 | 112 | convert_xlnet_checkpoint_to_pytorch(TF_CHECKPOINT, 113 | TF_CONFIG, 114 | PYTORCH_DUMP_OUTPUT, 115 | FINETUNING_TASK) 116 | elif sys.argv[1] == "xlm": 117 | from .convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch 118 | 119 | if len(sys.argv) != 4: 120 | # pylint: disable=line-too-long 121 | print("Should be used as `transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`") 122 | else: 123 | XLM_CHECKPOINT_PATH = sys.argv[2] 124 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 125 | 126 | convert_xlm_checkpoint_to_pytorch(XLM_CHECKPOINT_PATH, PYTORCH_DUMP_OUTPUT) 127 | 128 | if __name__ == '__main__': 129 | main() 130 | -------------------------------------------------------------------------------- /transformers/hf_api.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from __future__ import absolute_import, division, print_function 16 | 17 | import os 18 | from os.path import expanduser 19 | 20 | import requests 21 | import six 22 | from requests.exceptions import HTTPError 23 | from tqdm import tqdm 24 | 25 | ENDPOINT = "https://huggingface.co" 26 | 27 | class S3Obj: 28 | def __init__( 29 | self, 30 | filename, # type: str 31 | LastModified, # type: str 32 | ETag, # type: str 33 | Size, # type: int 34 | **kwargs 35 | ): 36 | self.filename = filename 37 | self.LastModified = LastModified 38 | self.ETag = ETag 39 | self.Size = Size 40 | 41 | 42 | class PresignedUrl: 43 | def __init__( 44 | self, 45 | write, # type: str 46 | access, # type: str 47 | type, # type: str 48 | **kwargs 49 | ): 50 | self.write = write 51 | self.access = access 52 | self.type = type # mime-type to send to S3. 53 | 54 | 55 | class HfApi: 56 | def __init__(self, endpoint=None): 57 | self.endpoint = endpoint if endpoint is not None else ENDPOINT 58 | 59 | def login( 60 | self, 61 | username, # type: str 62 | password, # type: str 63 | ): 64 | # type: (...) -> str 65 | """ 66 | Call HF API to sign in a user and get a token if credentials are valid. 67 | 68 | Outputs: 69 | token if credentials are valid 70 | 71 | Throws: 72 | requests.exceptions.HTTPError if credentials are invalid 73 | """ 74 | path = "{}/api/login".format(self.endpoint) 75 | r = requests.post(path, json={"username": username, "password": password}) 76 | r.raise_for_status() 77 | d = r.json() 78 | return d["token"] 79 | 80 | def whoami( 81 | self, 82 | token, # type: str 83 | ): 84 | # type: (...) -> str 85 | """ 86 | Call HF API to know "whoami" 87 | """ 88 | path = "{}/api/whoami".format(self.endpoint) 89 | r = requests.get(path, headers={"authorization": "Bearer {}".format(token)}) 90 | r.raise_for_status() 91 | d = r.json() 92 | return d["user"] 93 | 94 | def logout(self, token): 95 | # type: (...) -> void 96 | """ 97 | Call HF API to log out. 98 | """ 99 | path = "{}/api/logout".format(self.endpoint) 100 | r = requests.post(path, headers={"authorization": "Bearer {}".format(token)}) 101 | r.raise_for_status() 102 | 103 | def presign(self, token, filename): 104 | # type: (...) -> PresignedUrl 105 | """ 106 | Call HF API to get a presigned url to upload `filename` to S3. 107 | """ 108 | path = "{}/api/presign".format(self.endpoint) 109 | r = requests.post( 110 | path, 111 | headers={"authorization": "Bearer {}".format(token)}, 112 | json={"filename": filename}, 113 | ) 114 | r.raise_for_status() 115 | d = r.json() 116 | return PresignedUrl(**d) 117 | 118 | def presign_and_upload(self, token, filename, filepath): 119 | # type: (...) -> str 120 | """ 121 | Get a presigned url, then upload file to S3. 122 | 123 | Outputs: 124 | url: Read-only url for the stored file on S3. 125 | """ 126 | urls = self.presign(token, filename=filename) 127 | # streaming upload: 128 | # https://2.python-requests.org/en/master/user/advanced/#streaming-uploads 129 | # 130 | # Even though we presign with the correct content-type, 131 | # the client still has to specify it when uploading the file. 132 | with open(filepath, "rb") as f: 133 | pf = TqdmProgressFileReader(f) 134 | 135 | r = requests.put(urls.write, data=f, headers={ 136 | "content-type": urls.type, 137 | }) 138 | r.raise_for_status() 139 | pf.close() 140 | return urls.access 141 | 142 | def list_objs(self, token): 143 | # type: (...) -> List[S3Obj] 144 | """ 145 | Call HF API to list all stored files for user. 
146 | """ 147 | path = "{}/api/listObjs".format(self.endpoint) 148 | r = requests.get(path, headers={"authorization": "Bearer {}".format(token)}) 149 | r.raise_for_status() 150 | d = r.json() 151 | return [S3Obj(**x) for x in d] 152 | 153 | 154 | 155 | class TqdmProgressFileReader: 156 | """ 157 | Wrap an io.BufferedReader `f` (such as the output of `open(…, "rb")`) 158 | and override `f.read()` so as to display a tqdm progress bar. 159 | 160 | see github.com/huggingface/transformers/pull/2078#discussion_r354739608 161 | for implementation details. 162 | """ 163 | def __init__( 164 | self, 165 | f # type: io.BufferedReader 166 | ): 167 | self.f = f 168 | self.total_size = os.fstat(f.fileno()).st_size # type: int 169 | self.pbar = tqdm(total=self.total_size, leave=False) 170 | if six.PY3: 171 | # does not work unless PY3 172 | # no big deal as the CLI does not currently support PY2 anyways. 173 | self.read = f.read 174 | f.read = self._read 175 | 176 | def _read(self, n=-1): 177 | self.pbar.update(n) 178 | return self.read(n) 179 | 180 | def close(self): 181 | self.pbar.close() 182 | 183 | 184 | 185 | class HfFolder: 186 | path_token = expanduser("~/.huggingface/token") 187 | 188 | @classmethod 189 | def save_token(cls, token): 190 | """ 191 | Save token, creating folder as needed. 192 | """ 193 | if six.PY3: 194 | os.makedirs(os.path.dirname(cls.path_token), exist_ok=True) 195 | else: 196 | # Python 2 197 | try: 198 | os.makedirs(os.path.dirname(cls.path_token)) 199 | except OSError as e: 200 | if e.errno != os.errno.EEXIST: 201 | raise e 202 | pass 203 | with open(cls.path_token, 'w+') as f: 204 | f.write(token) 205 | 206 | @classmethod 207 | def get_token(cls): 208 | """ 209 | Get token or None if not existent. 210 | """ 211 | try: 212 | with open(cls.path_token, 'r') as f: 213 | return f.read() 214 | except: 215 | # this is too wide. When Py2 is dead use: 216 | # `except FileNotFoundError:` instead 217 | return None 218 | 219 | @classmethod 220 | def delete_token(cls): 221 | """ 222 | Delete token. 223 | Do not fail if token does not exist. 224 | """ 225 | try: 226 | os.remove(cls.path_token) 227 | except: 228 | return 229 | -------------------------------------------------------------------------------- /transformers/configuration_bert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ BERT model configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 30 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", 31 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", 32 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", 33 | 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", 34 | 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", 35 | 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", 36 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", 37 | 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", 38 | 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", 39 | 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", 40 | 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", 41 | 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", 42 | 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", 43 | 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json", 44 | 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json", 45 | 'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json", 46 | 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json", 47 | 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json", 48 | 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json" 49 | } 50 | 51 | 52 | class BertConfig(PretrainedConfig): 53 | r""" 54 | :class:`~transformers.BertConfig` is the configuration class to store the configuration of a 55 | `BertModel`. 56 | 57 | 58 | Arguments: 59 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. 60 | hidden_size: Size of the encoder layers and the pooler layer. 61 | num_hidden_layers: Number of hidden layers in the Transformer encoder. 62 | num_attention_heads: Number of attention heads for each attention layer in 63 | the Transformer encoder. 
64 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 65 | layer in the Transformer encoder. 66 | hidden_act: The non-linear activation function (function or string) in the 67 | encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. 68 | hidden_dropout_prob: The dropout probabilitiy for all fully connected 69 | layers in the embeddings, encoder, and pooler. 70 | attention_probs_dropout_prob: The dropout ratio for the attention 71 | probabilities. 72 | max_position_embeddings: The maximum sequence length that this model might 73 | ever be used with. Typically set this to something large just in case 74 | (e.g., 512 or 1024 or 2048). 75 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 76 | `BertModel`. 77 | initializer_range: The sttdev of the truncated_normal_initializer for 78 | initializing all weight matrices. 79 | layer_norm_eps: The epsilon used by LayerNorm. 80 | """ 81 | pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP 82 | 83 | def __init__(self, 84 | vocab_size_or_config_json_file=30522, 85 | hidden_size=768, 86 | num_hidden_layers=12, 87 | num_attention_heads=12, 88 | intermediate_size=3072, 89 | hidden_act="gelu", 90 | hidden_dropout_prob=0.1, 91 | attention_probs_dropout_prob=0.1, 92 | max_position_embeddings=512, 93 | type_vocab_size=2, 94 | initializer_range=0.02, 95 | layer_norm_eps=1e-12, 96 | **kwargs): 97 | super(BertConfig, self).__init__(**kwargs) 98 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 99 | and isinstance(vocab_size_or_config_json_file, unicode)): 100 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 101 | json_config = json.loads(reader.read()) 102 | for key, value in json_config.items(): 103 | self.__dict__[key] = value 104 | elif isinstance(vocab_size_or_config_json_file, int): 105 | self.vocab_size = vocab_size_or_config_json_file 106 | self.hidden_size = hidden_size 107 | self.num_hidden_layers = num_hidden_layers 108 | self.num_attention_heads = num_attention_heads 109 | self.hidden_act = hidden_act 110 | self.intermediate_size = intermediate_size 111 | self.hidden_dropout_prob = hidden_dropout_prob 112 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 113 | self.max_position_embeddings = max_position_embeddings 114 | self.type_vocab_size = type_vocab_size 115 | self.initializer_range = initializer_range 116 | self.layer_norm_eps = layer_norm_eps 117 | else: 118 | raise ValueError("First argument must be either a vocabulary size (int)" 119 | " or the path to a pretrained model config file (str)") 120 | -------------------------------------------------------------------------------- /transformers/tokenization_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
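Before the RoBERTa tokenizer below, a short sketch for `BertConfig` above: it follows the same int-or-JSON-path convention, and the inherited `to_json_string()` helper (used by the conversion scripts earlier in this bundle) makes a round trip easy. The JSON file name is a placeholder:

```python
from transformers.configuration_bert import BertConfig

# Build a BERT-base style configuration from the keyword defaults above.
config = BertConfig(vocab_size_or_config_json_file=30522)
print(config.num_hidden_layers, config.hidden_size)  # 12 768

# Serialize with the inherited helper and reload through the str branch
# of the constructor; "bert_config.json" is just a placeholder path.
with open("bert_config.json", "w", encoding="utf-8") as f:
    f.write(config.to_json_string())

reloaded = BertConfig(vocab_size_or_config_json_file="bert_config.json")
assert reloaded.hidden_size == config.hidden_size == 768
```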
15 | """Tokenization classes for RoBERTa.""" 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import sys 20 | import json 21 | import logging 22 | import os 23 | import regex as re 24 | from io import open 25 | 26 | from .tokenization_gpt2 import GPT2Tokenizer 27 | 28 | try: 29 | from functools import lru_cache 30 | except ImportError: 31 | # Just a dummy decorator to get the checks to run on python2 32 | # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 33 | def lru_cache(): 34 | return lambda func: func 35 | 36 | logger = logging.getLogger(__name__) 37 | 38 | VOCAB_FILES_NAMES = { 39 | 'vocab_file': 'vocab.json', 40 | 'merges_file': 'merges.txt', 41 | } 42 | 43 | PRETRAINED_VOCAB_FILES_MAP = { 44 | 'vocab_file': 45 | { 46 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", 47 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", 48 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json", 49 | 'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json", 50 | 'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", 51 | 'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", 52 | }, 53 | 'merges_file': 54 | { 55 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", 56 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", 57 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt", 58 | 'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txt", 59 | 'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", 60 | 'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", 61 | }, 62 | } 63 | 64 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 65 | 'roberta-base': 512, 66 | 'roberta-large': 512, 67 | 'roberta-large-mnli': 512, 68 | 'distilroberta-base': 512, 69 | 'roberta-base-openai-detector': 512, 70 | 'roberta-large-openai-detector': 512, 71 | } 72 | 73 | 74 | class RobertaTokenizer(GPT2Tokenizer): 75 | """ 76 | RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: 77 | - Byte-level Byte-Pair-Encoding 78 | - Requires a space to start the input string => the encoding methods should be called with the 79 | ``add_prefix_space`` flag set to ``True``. 
80 | Otherwise, this tokenizer's ``encode`` and ``decode`` methods will not conserve 81 | the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"` 82 | """ 83 | vocab_files_names = VOCAB_FILES_NAMES 84 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 85 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 86 | 87 | def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>", 88 | cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs): 89 | super(RobertaTokenizer, self).__init__(vocab_file=vocab_file, merges_file=merges_file, errors=errors, 90 | bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, 91 | sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, 92 | mask_token=mask_token, **kwargs) 93 | self.max_len_single_sentence = self.max_len - 2 # take into account special tokens 94 | self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens 95 | 96 | def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): 97 | """ 98 | Build model inputs from a sequence or a pair of sequence for sequence classification tasks 99 | by concatenating and adding special tokens. 100 | A RoBERTa sequence has the following format: 101 | single sequence: <s> X </s> 102 | pair of sequences: <s> A </s></s> B </s> 103 | """ 104 | if token_ids_1 is None: 105 | return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] 106 | cls = [self.cls_token_id] 107 | sep = [self.sep_token_id] 108 | return cls + token_ids_0 + sep + sep + token_ids_1 + sep 109 | 110 | def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): 111 | """ 112 | Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding 113 | special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. 114 | 115 | Args: 116 | token_ids_0: list of ids (must not contain special tokens) 117 | token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids 118 | for sequence pairs 119 | already_has_special_tokens: (default False) Set to True if the token list is already formatted with 120 | special tokens for the model 121 | 122 | Returns: 123 | A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 124 | """ 125 | if already_has_special_tokens: 126 | if token_ids_1 is not None: 127 | raise ValueError("You should not supply a second sequence if the provided sequence of " 128 | "ids is already formatted with special tokens for the model.") 129 | return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) 130 | 131 | if token_ids_1 is None: 132 | return [1] + ([0] * len(token_ids_0)) + [1] 133 | return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] 134 | 135 | def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): 136 | """ 137 | Creates a mask from the two sequences passed to be used in a sequence-pair classification task. 138 | A RoBERTa sequence pair mask has the following format: 139 | 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 140 | | first sequence | second sequence 141 | 142 | if token_ids_1 is None, only returns the first portion of the mask (0's).
143 | """ 144 | sep = [self.sep_token_id] 145 | cls = [self.cls_token_id] 146 | 147 | if token_ids_1 is None: 148 | return len(cls + token_ids_0 + sep) * [0] 149 | return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1] 150 | -------------------------------------------------------------------------------- /transformers/tokenization_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License 15 | """ Tokenization classes for Camembert model.""" 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import logging 20 | import os 21 | from shutil import copyfile 22 | 23 | import sentencepiece as spm 24 | from transformers.tokenization_utils import PreTrainedTokenizer 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'} 29 | 30 | PRETRAINED_VOCAB_FILES_MAP = { 31 | 'vocab_file': 32 | { 33 | 'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model", 34 | } 35 | } 36 | 37 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 38 | 'camembert-base': None, 39 | } 40 | 41 | class CamembertTokenizer(PreTrainedTokenizer): 42 | """ 43 | Adapted from RobertaTokenizer and XLNetTokenizer 44 | SentencePiece based tokenizer. 
Peculiarities: 45 | 46 | - requires `SentencePiece <https://github.com/google/sentencepiece>`_ 47 | """ 48 | vocab_files_names = VOCAB_FILES_NAMES 49 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 50 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 51 | 52 | def __init__(self, vocab_file, bos_token="<s>", eos_token="</s>", sep_token="</s>", 53 | cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', 54 | additional_special_tokens=['<s>NOTUSED', '</s>NOTUSED'], **kwargs): 55 | super(CamembertTokenizer, self).__init__(max_len=512, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, 56 | sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, 57 | mask_token=mask_token, additional_special_tokens=additional_special_tokens, 58 | **kwargs) 59 | self.max_len_single_sentence = self.max_len - 2 # take into account special tokens 60 | self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens 61 | self.sp_model = spm.SentencePieceProcessor() 62 | self.sp_model.Load(str(vocab_file)) 63 | self.vocab_file = vocab_file 64 | # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual 65 | # sentencepiece vocabulary (this is the case for <s> and </s>) 66 | self.fairseq_tokens_to_ids = {'<s>NOTUSED': 0, '<pad>': 1, '</s>NOTUSED': 2, '<unk>': 3} 67 | self.fairseq_offset = len(self.fairseq_tokens_to_ids) 68 | self.fairseq_tokens_to_ids['<mask>'] = len(self.sp_model) + len(self.fairseq_tokens_to_ids) 69 | self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} 70 | 71 | def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): 72 | """ 73 | Build model inputs from a sequence or a pair of sequence for sequence classification tasks 74 | by concatenating and adding special tokens. 75 | A RoBERTa sequence has the following format: 76 | single sequence: <s> X </s> 77 | pair of sequences: <s> A </s></s> B </s> 78 | """ 79 | if token_ids_1 is None: 80 | return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] 81 | cls = [self.cls_token_id] 82 | sep = [self.sep_token_id] 83 | return cls + token_ids_0 + sep + sep + token_ids_1 + sep 84 | 85 | def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): 86 | """ 87 | Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding 88 | special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. 89 | 90 | Args: 91 | token_ids_0: list of ids (must not contain special tokens) 92 | token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids 93 | for sequence pairs 94 | already_has_special_tokens: (default False) Set to True if the token list is already formatted with 95 | special tokens for the model 96 | 97 | Returns: 98 | A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
99 | """ 100 | if already_has_special_tokens: 101 | if token_ids_1 is not None: 102 | raise ValueError("You should not supply a second sequence if the provided sequence of " 103 | "ids is already formated with special tokens for the model.") 104 | return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) 105 | 106 | if token_ids_1 is None: 107 | return [1] + ([0] * len(token_ids_0)) + [1] 108 | return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] 109 | 110 | def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): 111 | """ 112 | Creates a mask from the two sequences passed to be used in a sequence-pair classification task. 113 | A RoBERTa sequence pair mask has the following format: 114 | 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 115 | | first sequence | second sequence 116 | 117 | if token_ids_1 is None, only returns the first portion of the mask (0's). 118 | """ 119 | sep = [self.sep_token_id] 120 | cls = [self.cls_token_id] 121 | 122 | if token_ids_1 is None: 123 | return len(cls + token_ids_0 + sep) * [0] 124 | return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1] 125 | 126 | @property 127 | def vocab_size(self): 128 | return len(self.fairseq_tokens_to_ids) + len(self.sp_model) 129 | 130 | def _tokenize(self, text): 131 | return self.sp_model.EncodeAsPieces(text) 132 | 133 | def _convert_token_to_id(self, token): 134 | """ Converts a token (str/unicode) in an id using the vocab. """ 135 | if token in self.fairseq_tokens_to_ids: 136 | return self.fairseq_tokens_to_ids[token] 137 | elif self.sp_model.PieceToId(token) == 0: 138 | # Convert sentence piece unk token to fairseq unk token index 139 | return self.unk_token_id 140 | return self.fairseq_offset + self.sp_model.PieceToId(token) 141 | 142 | def _convert_id_to_token(self, index): 143 | """Converts an index (integer) in a token (string/unicode) using the vocab.""" 144 | if index in self.fairseq_ids_to_tokens: 145 | return self.fairseq_ids_to_tokens[index] 146 | return self.sp_model.IdToPiece(index - self.fairseq_offset) 147 | 148 | def save_vocabulary(self, save_directory): 149 | """ Save the sentencepiece vocabulary (copy original file) and special tokens file 150 | to a directory. 151 | """ 152 | if not os.path.isdir(save_directory): 153 | logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) 154 | return 155 | out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) 156 | 157 | if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): 158 | copyfile(self.vocab_file, out_vocab_file) 159 | 160 | return (out_vocab_file,) 161 | -------------------------------------------------------------------------------- /transformers/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """PyTorch optimization for BERT model.""" 16 | 17 | import logging 18 | import math 19 | 20 | import torch 21 | from torch.optim import Optimizer 22 | from torch.optim.lr_scheduler import LambdaLR 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | def get_constant_schedule(optimizer, last_epoch=-1): 28 | """ Create a schedule with a constant learning rate. 29 | """ 30 | return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch) 31 | 32 | 33 | def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1): 34 | """ Create a schedule with a constant learning rate preceded by a warmup 35 | period during which the learning rate increases linearly between 0 and 1. 36 | """ 37 | def lr_lambda(current_step): 38 | if current_step < num_warmup_steps: 39 | return float(current_step) / float(max(1.0, num_warmup_steps)) 40 | return 1. 41 | 42 | return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch) 43 | 44 | 45 | def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): 46 | """ Create a schedule with a learning rate that decreases linearly after 47 | linearly increasing during a warmup period. 48 | """ 49 | def lr_lambda(current_step): 50 | if current_step < num_warmup_steps: 51 | return float(current_step) / float(max(1, num_warmup_steps)) 52 | return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))) 53 | 54 | return LambdaLR(optimizer, lr_lambda, last_epoch) 55 | 56 | 57 | def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=.5, last_epoch=-1): 58 | """ Create a schedule with a learning rate that decreases following the 59 | values of the cosine function between 0 and `pi * cycles` after a warmup 60 | period during which it increases linearly between 0 and 1. 61 | """ 62 | def lr_lambda(current_step): 63 | if current_step < num_warmup_steps: 64 | return float(current_step) / float(max(1, num_warmup_steps)) 65 | progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) 66 | return max(0., 0.5 * (1. + math.cos(math.pi * float(num_cycles) * 2. * progress))) 67 | 68 | return LambdaLR(optimizer, lr_lambda, last_epoch) 69 | 70 | 71 | def get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=1., last_epoch=-1): 72 | """ Create a schedule with a learning rate that decreases following the 73 | values of the cosine function with several hard restarts, after a warmup 74 | period during which it increases linearly between 0 and 1. 75 | """ 76 | def lr_lambda(current_step): 77 | if current_step < num_warmup_steps: 78 | return float(current_step) / float(max(1, num_warmup_steps)) 79 | progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) 80 | if progress >= 1.: 81 | return 0. 82 | return max(0., 0.5 * (1. + math.cos(math.pi * ((float(num_cycles) * progress) % 1.)))) 83 | 84 | return LambdaLR(optimizer, lr_lambda, last_epoch) 85 | 86 | 87 | class AdamW(Optimizer): 88 | """ Implements Adam algorithm with weight decay fix. 89 | 90 | Parameters: 91 | lr (float): learning rate. Default 1e-3. 92 | betas (tuple of 2 floats): Adams beta parameters (b1, b2). Default: (0.9, 0.999) 93 | eps (float): Adams epsilon. Default: 1e-6 94 | weight_decay (float): Weight decay. 
Default: 0.0 95 | correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True. 96 | """ 97 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True): 98 | if lr < 0.0: 99 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 100 | if not 0.0 <= betas[0] < 1.0: 101 | raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0])) 102 | if not 0.0 <= betas[1] < 1.0: 103 | raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1])) 104 | if not 0.0 <= eps: 105 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps)) 106 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, 107 | correct_bias=correct_bias) 108 | super(AdamW, self).__init__(params, defaults) 109 | 110 | def step(self, closure=None): 111 | """Performs a single optimization step. 112 | 113 | Arguments: 114 | closure (callable, optional): A closure that reevaluates the model 115 | and returns the loss. 116 | """ 117 | loss = None 118 | if closure is not None: 119 | loss = closure() 120 | 121 | for group in self.param_groups: 122 | for p in group['params']: 123 | if p.grad is None: 124 | continue 125 | grad = p.grad.data 126 | if grad.is_sparse: 127 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 128 | 129 | state = self.state[p] 130 | 131 | # State initialization 132 | if len(state) == 0: 133 | state['step'] = 0 134 | # Exponential moving average of gradient values 135 | state['exp_avg'] = torch.zeros_like(p.data) 136 | # Exponential moving average of squared gradient values 137 | state['exp_avg_sq'] = torch.zeros_like(p.data) 138 | 139 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 140 | beta1, beta2 = group['betas'] 141 | 142 | state['step'] += 1 143 | 144 | # Decay the first and second moment running average coefficient 145 | # In-place operations to update the averages at the same time 146 | exp_avg.mul_(beta1).add_(1.0 - beta1, grad) 147 | exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad) 148 | denom = exp_avg_sq.sqrt().add_(group['eps']) 149 | 150 | step_size = group['lr'] 151 | if group['correct_bias']: # No bias correction for Bert 152 | bias_correction1 = 1.0 - beta1 ** state['step'] 153 | bias_correction2 = 1.0 - beta2 ** state['step'] 154 | step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 155 | 156 | p.data.addcdiv_(-step_size, exp_avg, denom) 157 | 158 | # Just adding the square of the weights to the loss function is *not* 159 | # the correct way of using L2 regularization/weight decay with Adam, 160 | # since that will interact with the m and v parameters in strange ways. 161 | # 162 | # Instead we want to decay the weights in a manner that doesn't interact 163 | # with the m/v parameters. This is equivalent to adding the square 164 | # of the weights to the loss with plain (non-momentum) SGD. 165 | # Add weight decay at the end (fixed version) 166 | if group['weight_decay'] > 0.0: 167 | p.data.add_(-group['lr'] * group['weight_decay'], p.data) 168 | 169 | return loss 170 | -------------------------------------------------------------------------------- /transformers/tokenization_openai.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 
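# A minimal sketch of how the AdamW optimizer and the warmup schedules defined in
# optimization.py above are typically wired together; `model`, `loss` and `t_total`
# are assumed to be defined by the surrounding training script:
#
#     from transformers import AdamW, get_linear_schedule_with_warmup
#
#     optimizer = AdamW(model.parameters(), lr=5e-6, weight_decay=0.01)
#     scheduler = get_linear_schedule_with_warmup(
#         optimizer, num_warmup_steps=100, num_training_steps=t_total)
#
#     loss.backward()
#     optimizer.step()
#     scheduler.step()   # advance the learning-rate schedule once per optimizer step
#     optimizer.zero_grad()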
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for OpenAI GPT.""" 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import json 20 | import logging 21 | import os 22 | import re 23 | from io import open 24 | 25 | from .tokenization_utils import PreTrainedTokenizer 26 | from .tokenization_bert import BasicTokenizer 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | VOCAB_FILES_NAMES = { 31 | 'vocab_file': 'vocab.json', 32 | 'merges_file': 'merges.txt', 33 | } 34 | 35 | PRETRAINED_VOCAB_FILES_MAP = { 36 | 'vocab_file': 37 | { 38 | 'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json", 39 | }, 40 | 'merges_file': 41 | { 42 | 'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt", 43 | }, 44 | } 45 | 46 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 47 | 'openai-gpt': 512, 48 | } 49 | 50 | def get_pairs(word): 51 | """ 52 | Return set of symbol pairs in a word. 53 | word is represented as tuple of symbols (symbols being variable-length strings) 54 | """ 55 | pairs = set() 56 | prev_char = word[0] 57 | for char in word[1:]: 58 | pairs.add((prev_char, char)) 59 | prev_char = char 60 | return pairs 61 | 62 | def text_standardize(text): 63 | """ 64 | fixes some issues the spacy tokenizer had on books corpus 65 | also does some whitespace standardization 66 | """ 67 | text = text.replace('—', '-') 68 | text = text.replace('–', '-') 69 | text = text.replace('―', '-') 70 | text = text.replace('…', '...') 71 | text = text.replace('´', "'") 72 | text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text) 73 | text = re.sub(r'\s*\n\s*', ' \n ', text) 74 | text = re.sub(r'[^\S\n]+', ' ', text) 75 | return text.strip() 76 | 77 | class OpenAIGPTTokenizer(PreTrainedTokenizer): 78 | """ 79 | BPE tokenizer. Peculiarities: 80 | - lower case all inputs 81 | - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not. 
82 | """ 83 | vocab_files_names = VOCAB_FILES_NAMES 84 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 85 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 86 | 87 | def __init__(self, vocab_file, merges_file, unk_token="", **kwargs): 88 | super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs) 89 | 90 | self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens 91 | self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens 92 | 93 | try: 94 | import ftfy 95 | from spacy.lang.en import English 96 | _nlp = English() 97 | self.nlp = _nlp.Defaults.create_tokenizer(_nlp) 98 | self.fix_text = ftfy.fix_text 99 | except ImportError: 100 | logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.") 101 | self.nlp = BasicTokenizer(do_lower_case=True) 102 | self.fix_text = None 103 | 104 | with open(vocab_file, encoding="utf-8") as vocab_handle: 105 | self.encoder = json.load(vocab_handle) 106 | self.decoder = {v:k for k,v in self.encoder.items()} 107 | with open(merges_file, encoding='utf-8') as merges_handle: 108 | merges = merges_handle.read().split('\n')[1:-1] 109 | merges = [tuple(merge.split()) for merge in merges] 110 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 111 | self.cache = {} 112 | 113 | @property 114 | def vocab_size(self): 115 | return len(self.encoder) 116 | 117 | def bpe(self, token): 118 | word = tuple(token[:-1]) + (token[-1] + '',) 119 | if token in self.cache: 120 | return self.cache[token] 121 | pairs = get_pairs(word) 122 | 123 | if not pairs: 124 | return token+'' 125 | 126 | while True: 127 | bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) 128 | if bigram not in self.bpe_ranks: 129 | break 130 | first, second = bigram 131 | new_word = [] 132 | i = 0 133 | while i < len(word): 134 | try: 135 | j = word.index(first, i) 136 | new_word.extend(word[i:j]) 137 | i = j 138 | except: 139 | new_word.extend(word[i:]) 140 | break 141 | 142 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 143 | new_word.append(first+second) 144 | i += 2 145 | else: 146 | new_word.append(word[i]) 147 | i += 1 148 | new_word = tuple(new_word) 149 | word = new_word 150 | if len(word) == 1: 151 | break 152 | else: 153 | pairs = get_pairs(word) 154 | word = ' '.join(word) 155 | if word == '\n ': 156 | word = '\n' 157 | self.cache[token] = word 158 | return word 159 | 160 | def _tokenize(self, text): 161 | """ Tokenize a string. """ 162 | split_tokens = [] 163 | if self.fix_text is None: 164 | # Using BERT's BasicTokenizer 165 | text = self.nlp.tokenize(text) 166 | for token in text: 167 | split_tokens.extend([t for t in self.bpe(token).split(' ')]) 168 | else: 169 | # Using SpaCy & ftfy (original tokenization process of OpenAI GPT) 170 | text = self.nlp(text_standardize(self.fix_text(text))) 171 | for token in text: 172 | split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')]) 173 | return split_tokens 174 | 175 | def _convert_token_to_id(self, token): 176 | """ Converts a token (str/unicode) in an id using the vocab. 
""" 177 | return self.encoder.get(token, self.encoder.get(self.unk_token)) 178 | 179 | def _convert_id_to_token(self, index): 180 | """Converts an id in a token (BPE) using the vocab.""" 181 | return self.decoder.get(index, self.unk_token) 182 | 183 | def convert_tokens_to_string(self, tokens): 184 | """ Converts a sequence of tokens (string) in a single string. """ 185 | out_string = ''.join(tokens).replace('', ' ').strip() 186 | return out_string 187 | 188 | def save_vocabulary(self, save_directory): 189 | """Save the tokenizer vocabulary and merge files to a directory.""" 190 | if not os.path.isdir(save_directory): 191 | logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) 192 | return 193 | vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) 194 | merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file']) 195 | 196 | with open(vocab_file, 'w', encoding='utf-8') as f: 197 | f.write(json.dumps(self.encoder, ensure_ascii=False)) 198 | 199 | index = 0 200 | with open(merge_file, "w", encoding="utf-8") as writer: 201 | writer.write(u'#version: 0.2\n') 202 | for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): 203 | if index != token_index: 204 | logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." 205 | " Please check that the tokenizer is not corrupted!".format(merge_file)) 206 | index = token_index 207 | writer.write(' '.join(bpe_tokens) + u'\n') 208 | index += 1 209 | 210 | return vocab_file, merge_file 211 | -------------------------------------------------------------------------------- /transformers/configuration_xlm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """ XLM configuration """ 16 | from __future__ import absolute_import, division, print_function, unicode_literals 17 | 18 | import json 19 | import logging 20 | import sys 21 | from io import open 22 | 23 | from .configuration_utils import PretrainedConfig 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = { 28 | 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json", 29 | 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.json", 30 | 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-config.json", 31 | 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.json", 32 | 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.json", 33 | 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json", 34 | 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json", 35 | 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json", 36 | 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json", 37 | 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-config.json", 38 | } 39 | 40 | 41 | class XLMConfig(PretrainedConfig): 42 | """Configuration class to store the configuration of a `XLMModel`. 43 | 44 | Args: 45 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`. 46 | d_model: Size of the encoder layers and the pooler layer. 47 | n_layer: Number of hidden layers in the Transformer encoder. 48 | n_head: Number of attention heads for each attention layer in 49 | the Transformer encoder. 50 | d_inner: The size of the "intermediate" (i.e., feed-forward) 51 | layer in the Transformer encoder. 52 | ff_activation: The non-linear activation function (function or string) in the 53 | encoder and pooler. If string, "gelu", "relu" and "swish" are supported. 54 | untie_r: untie relative position biases 55 | attn_type: 'bi' for XLM, 'uni' for Transformer-XL 56 | 57 | dropout: The dropout probabilitiy for all fully connected 58 | layers in the embeddings, encoder, and pooler. 59 | max_position_embeddings: The maximum sequence length that this model might 60 | ever be used with. Typically set this to something large just in case 61 | (e.g., 512 or 1024 or 2048). 62 | initializer_range: The sttdev of the truncated_normal_initializer for 63 | initializing all weight matrices. 64 | layer_norm_eps: The epsilon used by LayerNorm. 65 | 66 | dropout: float, dropout rate. 67 | init: str, the initialization scheme, either "normal" or "uniform". 68 | init_range: float, initialize the parameters with a uniform distribution 69 | in [-init_range, init_range]. Only effective when init="uniform". 70 | init_std: float, initialize the parameters with a normal distribution 71 | with mean 0 and stddev init_std. Only effective when init="normal". 72 | mem_len: int, the number of tokens to cache. 73 | reuse_len: int, the number of tokens in the currect batch to be cached 74 | and reused in the future. 75 | bi_data: bool, whether to use bidirectional input pipeline. 76 | Usually set to True during pretraining and False during finetuning. 77 | clamp_len: int, clamp all relative distances larger than clamp_len. 
78 | -1 means no clamping. 79 | same_length: bool, whether to use the same attention length for each token. 80 | """ 81 | pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP 82 | 83 | def __init__(self, 84 | vocab_size_or_config_json_file=30145, 85 | emb_dim=2048, 86 | n_layers=12, 87 | n_heads=16, 88 | dropout=0.1, 89 | attention_dropout=0.1, 90 | gelu_activation=True, 91 | sinusoidal_embeddings=False, 92 | causal=False, 93 | asm=False, 94 | n_langs=1, 95 | use_lang_emb=True, 96 | max_position_embeddings=512, 97 | embed_init_std=2048 ** -0.5, 98 | layer_norm_eps=1e-12, 99 | init_std=0.02, 100 | bos_index=0, 101 | eos_index=1, 102 | pad_index=2, 103 | unk_index=3, 104 | mask_index=5, 105 | is_encoder=True, 106 | 107 | finetuning_task=None, 108 | num_labels=2, 109 | summary_type='first', 110 | summary_use_proj=True, 111 | summary_activation=None, 112 | summary_proj_to_labels=True, 113 | summary_first_dropout=0.1, 114 | start_n_top=5, 115 | end_n_top=5, 116 | **kwargs): 117 | """Constructs XLMConfig. 118 | """ 119 | super(XLMConfig, self).__init__(**kwargs) 120 | 121 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 122 | and isinstance(vocab_size_or_config_json_file, unicode)): 123 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 124 | json_config = json.loads(reader.read()) 125 | for key, value in json_config.items(): 126 | self.__dict__[key] = value 127 | elif isinstance(vocab_size_or_config_json_file, int): 128 | self.n_words = vocab_size_or_config_json_file 129 | self.emb_dim = emb_dim 130 | self.n_layers = n_layers 131 | self.n_heads = n_heads 132 | self.dropout = dropout 133 | self.attention_dropout = attention_dropout 134 | self.gelu_activation = gelu_activation 135 | self.sinusoidal_embeddings = sinusoidal_embeddings 136 | self.causal = causal 137 | self.asm = asm 138 | self.n_langs = n_langs 139 | self.use_lang_emb = use_lang_emb 140 | self.layer_norm_eps = layer_norm_eps 141 | self.bos_index = bos_index 142 | self.eos_index = eos_index 143 | self.pad_index = pad_index 144 | self.unk_index = unk_index 145 | self.mask_index = mask_index 146 | self.is_encoder = is_encoder 147 | self.max_position_embeddings = max_position_embeddings 148 | self.embed_init_std = embed_init_std 149 | self.init_std = init_std 150 | self.finetuning_task = finetuning_task 151 | self.num_labels = num_labels 152 | self.summary_type = summary_type 153 | self.summary_use_proj = summary_use_proj 154 | self.summary_activation = summary_activation 155 | self.summary_proj_to_labels = summary_proj_to_labels 156 | self.summary_first_dropout = summary_first_dropout 157 | self.start_n_top = start_n_top 158 | self.end_n_top = end_n_top 159 | else: 160 | raise ValueError("First argument must be either a vocabulary size (int)" 161 | " or the path to a pretrained model config file (str)") 162 | 163 | @property 164 | def vocab_size(self): 165 | return self.n_words 166 | 167 | @vocab_size.setter 168 | def vocab_size(self, value): 169 | self.n_words = value 170 | 171 | @property 172 | def hidden_size(self): 173 | return self.emb_dim 174 | 175 | @property 176 | def num_attention_heads(self): 177 | return self.n_heads 178 | 179 | @property 180 | def num_hidden_layers(self): 181 | return self.n_layers 182 | -------------------------------------------------------------------------------- /transformers/modeling_tf_transfo_xl_utilities.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 
Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ A TF 2.0 Adaptive Softmax for Transformer XL model. 17 | """ 18 | 19 | from collections import defaultdict 20 | 21 | import numpy as np 22 | 23 | import tensorflow as tf 24 | 25 | from .modeling_tf_utils import shape_list 26 | 27 | class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): 28 | def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, 29 | keep_order=False, **kwargs): 30 | super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs) 31 | 32 | self.n_token = n_token 33 | self.d_embed = d_embed 34 | self.d_proj = d_proj 35 | 36 | self.cutoffs = cutoffs + [n_token] 37 | self.cutoff_ends = [0] + self.cutoffs 38 | self.div_val = div_val 39 | 40 | self.shortlist_size = self.cutoffs[0] 41 | self.n_clusters = len(self.cutoffs) - 1 42 | self.head_size = self.shortlist_size + self.n_clusters 43 | self.keep_order = keep_order 44 | 45 | self.out_layers = [] 46 | self.out_projs = [] 47 | 48 | def build(self, input_shape): 49 | if self.n_clusters > 0: 50 | self.cluster_weight = self.add_weight(shape=(self.n_clusters, self.d_embed), 51 | initializer='zeros', 52 | trainable=True, 53 | name='cluster_weight') 54 | self.cluster_bias = self.add_weight(shape=(self.n_clusters,), 55 | initializer='zeros', 56 | trainable=True, 57 | name='cluster_bias') 58 | 59 | if self.div_val == 1: 60 | for i in range(len(self.cutoffs)): 61 | if self.d_proj != self.d_embed: 62 | weight = self.add_weight(shape=(self.d_embed, self.d_proj), 63 | initializer='zeros', 64 | trainable=True, 65 | name='out_projs_._{}'.format(i)) 66 | self.out_projs.append(weight) 67 | else: 68 | self.out_projs.append(None) 69 | weight = self.add_weight(shape=(self.n_token, self.d_embed,), 70 | initializer='zeros', 71 | trainable=True, 72 | name='out_layers_._{}_._weight'.format(i)) 73 | bias = self.add_weight(shape=(self.n_token,), 74 | initializer='zeros', 75 | trainable=True, 76 | name='out_layers_._{}_._bias'.format(i)) 77 | self.out_layers.append((weight, bias)) 78 | else: 79 | for i in range(len(self.cutoffs)): 80 | l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] 81 | d_emb_i = self.d_embed // (self.div_val ** i) 82 | 83 | weight = self.add_weight(shape=(d_emb_i, self.d_proj), 84 | initializer='zeros', 85 | trainable=True, 86 | name='out_projs_._{}'.format(i)) 87 | self.out_projs.append(weight) 88 | weight = self.add_weight(shape=(r_idx-l_idx, d_emb_i,), 89 | initializer='zeros', 90 | trainable=True, 91 | name='out_layers_._{}_._weight'.format(i)) 92 | bias = self.add_weight(shape=(r_idx-l_idx,), 93 | initializer='zeros', 94 | trainable=True, 95 | name='out_layers_._{}_._bias'.format(i)) 96 | self.out_layers.append((weight, bias)) 97 | super(TFAdaptiveSoftmaxMask, self).build(input_shape) 98 | 99 | @staticmethod 100 | def _logit(x, W, b, proj=None): 101 | y = x 102 | if proj 
is not None: 103 | y = tf.einsum('ibd,ed->ibe', y, proj) 104 | return tf.einsum('ibd,nd->ibn', y, W) + b 105 | 106 | @staticmethod 107 | def _gather_logprob(logprob, target): 108 | lp_size = shape_list(logprob) 109 | r = tf.range(lp_size[0]) 110 | idx = tf.stack([r, target], 1) 111 | return tf.gather_nd(logprob, idx) 112 | 113 | def call(self, inputs, return_mean=True, training=False): 114 | hidden, target = inputs 115 | head_logprob = 0 116 | if self.n_clusters == 0: 117 | softmax_b = tf.get_variable('bias', [n_token], initializer=tf.zeros_initializer()) 118 | output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0]) 119 | if target is not None: 120 | loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output) 121 | out = tf.nn.log_softmax(output, axis=-1) 122 | else: 123 | hidden_sizes = shape_list(hidden) 124 | out = [] 125 | loss = tf.zeros(hidden_sizes[:2], dtype=tf.float32) 126 | for i in range(len(self.cutoffs)): 127 | l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] 128 | if target is not None: 129 | mask = (target >= l_idx) & (target < r_idx) 130 | mask_idx = tf.where(mask) 131 | cur_target = tf.boolean_mask(target, mask) - l_idx 132 | 133 | if self.div_val == 1: 134 | cur_W = self.out_layers[0][0][l_idx:r_idx] 135 | cur_b = self.out_layers[0][1][l_idx:r_idx] 136 | else: 137 | cur_W = self.out_layers[i][0] 138 | cur_b = self.out_layers[i][1] 139 | 140 | if i == 0: 141 | cur_W = tf.concat([cur_W, self.cluster_weight], 0) 142 | cur_b = tf.concat([cur_b, self.cluster_bias], 0) 143 | 144 | head_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[0]) 145 | head_logprob = tf.nn.log_softmax(head_logit) 146 | out.append(head_logprob[..., :self.cutoffs[0]]) 147 | if target is not None: 148 | cur_head_logprob = tf.boolean_mask(head_logprob, mask) 149 | cur_logprob = self._gather_logprob(cur_head_logprob, cur_target) 150 | else: 151 | tail_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[i]) 152 | tail_logprob = tf.nn.log_softmax(tail_logit) 153 | cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster 154 | logprob_i = head_logprob[..., cluster_prob_idx, None] + tail_logprob 155 | out.append(logprob_i) 156 | if target is not None: 157 | cur_head_logprob = tf.boolean_mask(head_logprob, mask) 158 | cur_tail_logprob = tf.boolean_mask(tail_logprob, mask) 159 | cur_logprob = self._gather_logprob(cur_tail_logprob, cur_target) 160 | cur_logprob += cur_head_logprob[:, self.cutoff_ends[1] + i - 1] 161 | if target is not None: 162 | loss += tf.scatter_nd(mask_idx, -cur_logprob, tf.cast(shape_list(loss), dtype=tf.int64)) 163 | out = tf.concat(out, axis=-1) 164 | 165 | if target is not None: 166 | if return_mean: 167 | loss = tf.reduce_mean(loss) 168 | # Add the training-time loss value to the layer using `self.add_loss()`. 169 | self.add_loss(loss) 170 | 171 | # Log the loss as a metric (we could log arbitrary metrics, 172 | # including different metrics for training and inference. 173 | self.add_metric(loss, name=self.name, aggregation='mean' if return_mean else '') 174 | 175 | return out 176 | -------------------------------------------------------------------------------- /transformers/tokenization_ctrl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Salesforce and The HuggingFace Inc. team. 
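# A rough sketch of how the adaptive softmax layer defined in
# modeling_tf_transfo_xl_utilities.py above is driven, assuming Transformer-XL-style
# shapes: `hidden` of shape (seq_len, batch, d_proj) and integer `target` of shape
# (seq_len, batch); the hyper-parameter values below are illustrative assumptions:
#
#     softmax = TFAdaptiveSoftmaxMask(n_token=267735, d_embed=1024, d_proj=1024,
#                                     cutoffs=[20000, 40000, 200000], div_val=4)
#     log_probs = softmax([hidden, target], training=True)
#     # the cross-entropy loss is attached to the layer itself via self.add_loss()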
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for Salesforce CTRL.""" 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import json 20 | import logging 21 | import os 22 | import regex as re 23 | from io import open 24 | 25 | from .tokenization_utils import PreTrainedTokenizer 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | VOCAB_FILES_NAMES = { 30 | 'vocab_file': 'vocab.json', 31 | 'merges_file': 'merges.txt', 32 | } 33 | 34 | PRETRAINED_VOCAB_FILES_MAP = { 35 | 'vocab_file': 36 | { 37 | 'ctrl': "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.json", 38 | }, 39 | 'merges_file': 40 | { 41 | 'ctrl': "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txt", 42 | }, 43 | } 44 | 45 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 46 | 'ctrl': 256, 47 | } 48 | 49 | CONTROL_CODES = { 50 | "Pregnancy": 168629, 51 | "Christianity": 7675, 52 | "Explain": 106423, 53 | "Fitness": 63440, 54 | "Saving": 63163, 55 | "Ask": 27171, 56 | "Ass": 95985, 57 | "Joke": 163509, 58 | "Questions": 45622, 59 | "Thoughts": 49605, 60 | "Retail": 52342, 61 | "Feminism": 164338, 62 | "Writing": 11992, 63 | "Atheism": 192263, 64 | "Netflix": 48616, 65 | "Computing": 39639, 66 | "Opinion": 43213, 67 | "Alone": 44967, 68 | "Funny": 58917, 69 | "Gaming": 40358, 70 | "Human": 4088, 71 | "India": 1331, 72 | "Joker": 77138, 73 | "Diet": 36206, 74 | "Legal": 11859, 75 | "Norman": 4939, 76 | "Tip": 72689, 77 | "Weight": 52343, 78 | "Movies": 46273, 79 | "Running": 23425, 80 | "Science": 2090, 81 | "Horror": 37793, 82 | "Confession": 60572, 83 | "Finance": 12250, 84 | "Politics": 16360, 85 | "Scary": 191985, 86 | "Support": 12654, 87 | "Technologies": 32516, 88 | "Teenage": 66160, 89 | "Event": 32769, 90 | "Learned": 67460, 91 | "Notion": 182770, 92 | "Wikipedia": 37583, 93 | "Books": 6665, 94 | "Extract": 76050, 95 | "Confessions": 102701, 96 | "Conspiracy": 75932, 97 | "Links": 63674, 98 | "Narcissus": 150425, 99 | "Relationship": 54766, 100 | "Relationships": 134796, 101 | "Reviews": 41671, 102 | "News": 4256, 103 | "Translation": 26820, 104 | "multilingual": 128406, 105 | } 106 | 107 | def get_pairs(word): 108 | """Return set of symbol pairs in a word. 109 | 110 | Word is represented as tuple of symbols (symbols being variable-length strings). 111 | """ 112 | pairs = set() 113 | prev_char = word[0] 114 | for char in word[1:]: 115 | pairs.add((prev_char, char)) 116 | prev_char = char 117 | 118 | pairs = set(pairs) 119 | return pairs 120 | 121 | class CTRLTokenizer(PreTrainedTokenizer): 122 | """ 123 | CTRL BPE tokenizer. 
Peculiarities: 124 | - Byte-Pair-Encoding 125 | """ 126 | vocab_files_names = VOCAB_FILES_NAMES 127 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 128 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 129 | control_codes = CONTROL_CODES 130 | 131 | def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs): 132 | super(CTRLTokenizer, self).__init__(unk_token=unk_token, **kwargs) 133 | self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens 134 | self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens 135 | 136 | with open(vocab_file, encoding="utf-8") as vocab_handle: 137 | self.encoder = json.load(vocab_handle) 138 | self.decoder = {v:k for k,v in self.encoder.items()} 139 | with open(merges_file, encoding='utf-8') as merges_handle: 140 | merges = merges_handle.read().split('\n')[1:-1] 141 | merges = [tuple(merge.split()) for merge in merges] 142 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 143 | self.cache = {} 144 | 145 | @property 146 | def vocab_size(self): 147 | return len(self.encoder) 148 | 149 | def bpe(self, token): 150 | if token in self.cache: 151 | return self.cache[token] 152 | word = tuple(token) 153 | word = tuple(list(word[:-1]) + [word[-1]+'</w>']) 154 | pairs = get_pairs(word) 155 | 156 | if not pairs: 157 | return token 158 | 159 | while True: 160 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 161 | if bigram not in self.bpe_ranks: 162 | break 163 | first, second = bigram 164 | new_word = [] 165 | i = 0 166 | while i < len(word): 167 | try: 168 | j = word.index(first, i) 169 | new_word.extend(word[i:j]) 170 | i = j 171 | except: 172 | new_word.extend(word[i:]) 173 | break 174 | 175 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 176 | new_word.append(first+second) 177 | i += 2 178 | else: 179 | new_word.append(word[i]) 180 | i += 1 181 | new_word = tuple(new_word) 182 | word = new_word 183 | if len(word) == 1: 184 | break 185 | else: 186 | pairs = get_pairs(word) 187 | word = '@@ '.join(word) 188 | word = word[:-4] 189 | self.cache[token] = word 190 | return word 191 | 192 | def _tokenize(self, text): 193 | """ Tokenize a string. 194 | """ 195 | split_tokens = [] 196 | 197 | words = re.findall(r'\S+\n?', text) 198 | 199 | for token in words: 200 | split_tokens.extend([t for t in self.bpe(token).split(' ')]) 201 | return split_tokens 202 | 203 | def _convert_token_to_id(self, token): 204 | """ Converts a token (str/unicode) to an id using the vocab. """ 205 | return self.encoder.get(token, self.encoder.get(self.unk_token)) 206 | 207 | def _convert_id_to_token(self, index): 208 | """Converts an index (integer) to a token (string/unicode) using the vocab.""" 209 | return self.decoder.get(index, self.unk_token) 210 | 211 | def convert_tokens_to_string(self, tokens): 212 | """ Converts a sequence of tokens (string) into a single string.
""" 213 | out_string = ' '.join(tokens).replace('@@ ', '').strip() 214 | return out_string 215 | 216 | def save_vocabulary(self, save_directory): 217 | """Save the tokenizer vocabulary and merge files to a directory.""" 218 | if not os.path.isdir(save_directory): 219 | logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) 220 | return 221 | vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) 222 | merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file']) 223 | 224 | with open(vocab_file, 'w', encoding='utf-8') as f: 225 | f.write(json.dumps(self.encoder, ensure_ascii=False)) 226 | 227 | index = 0 228 | with open(merge_file, "w", encoding="utf-8") as writer: 229 | writer.write(u'#version: 0.2\n') 230 | for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): 231 | if index != token_index: 232 | logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." 233 | " Please check that the tokenizer is not corrupted!".format(merge_file)) 234 | index = token_index 235 | writer.write(' '.join(bpe_tokens) + u'\n') 236 | index += 1 237 | 238 | return vocab_file, merge_file 239 | 240 | # def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True): 241 | # filtered_tokens = ' '.join(self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)) 242 | # tokens_generated_so_far = re.sub('(@@ )', '', string=filtered_tokens) 243 | # tokens_generated_so_far = re.sub('(@@ ?$)', '', string=tokens_generated_so_far) 244 | # return ''.join(tokens_generated_so_far) 245 | -------------------------------------------------------------------------------- /transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert RoBERTa checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import logging 21 | import numpy as np 22 | import torch 23 | 24 | from fairseq.models.roberta import RobertaModel as FairseqRobertaModel 25 | from fairseq.modules import TransformerSentenceEncoderLayer 26 | from transformers.modeling_bert import (BertConfig, BertEncoder, 27 | BertIntermediate, BertLayer, 28 | BertModel, BertOutput, 29 | BertSelfAttention, 30 | BertSelfOutput) 31 | from transformers.modeling_roberta import (RobertaEmbeddings, 32 | RobertaForMaskedLM, 33 | RobertaForSequenceClassification, 34 | RobertaModel) 35 | 36 | logging.basicConfig(level=logging.INFO) 37 | logger = logging.getLogger(__name__) 38 | 39 | SAMPLE_TEXT = 'Hello world! cécé herlolip' 40 | 41 | 42 | def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head): 43 | """ 44 | Copy/paste/tweak roberta's weights to our BERT structure. 
45 | """ 46 | roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path) 47 | roberta.eval() # disable dropout 48 | config = BertConfig( 49 | vocab_size_or_config_json_file=50265, 50 | hidden_size=roberta.args.encoder_embed_dim, 51 | num_hidden_layers=roberta.args.encoder_layers, 52 | num_attention_heads=roberta.args.encoder_attention_heads, 53 | intermediate_size=roberta.args.encoder_ffn_embed_dim, 54 | max_position_embeddings=514, 55 | type_vocab_size=1, 56 | layer_norm_eps=1e-5, # PyTorch default used in fairseq 57 | ) 58 | if classification_head: 59 | config.num_labels = roberta.args.num_classes 60 | print("Our BERT config:", config) 61 | 62 | model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config) 63 | model.eval() 64 | 65 | # Now let's copy all the weights. 66 | # Embeddings 67 | roberta_sent_encoder = roberta.model.decoder.sentence_encoder 68 | model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight 69 | model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight 70 | model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight) # just zero them out b/c RoBERTa doesn't use them. 71 | model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight 72 | model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias 73 | 74 | for i in range(config.num_hidden_layers): 75 | # Encoder: start of layer 76 | layer: BertLayer = model.roberta.encoder.layer[i] 77 | roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i] 78 | 79 | ### self attention 80 | self_attn: BertSelfAttention = layer.attention.self 81 | assert( 82 | roberta_layer.self_attn.in_proj_weight.shape == torch.Size((3 * config.hidden_size, config.hidden_size)) 83 | ) 84 | # we use three distinct linear layers so we split the source layer here. 
85 | self_attn.query.weight.data = roberta_layer.self_attn.in_proj_weight[:config.hidden_size, :] 86 | self_attn.query.bias.data = roberta_layer.self_attn.in_proj_bias[:config.hidden_size] 87 | self_attn.key.weight.data = roberta_layer.self_attn.in_proj_weight[config.hidden_size:2*config.hidden_size, :] 88 | self_attn.key.bias.data = roberta_layer.self_attn.in_proj_bias[config.hidden_size:2*config.hidden_size] 89 | self_attn.value.weight.data = roberta_layer.self_attn.in_proj_weight[2*config.hidden_size:, :] 90 | self_attn.value.bias.data = roberta_layer.self_attn.in_proj_bias[2*config.hidden_size:] 91 | 92 | ### self-attention output 93 | self_output: BertSelfOutput = layer.attention.output 94 | assert( 95 | self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape 96 | ) 97 | self_output.dense.weight = roberta_layer.self_attn.out_proj.weight 98 | self_output.dense.bias = roberta_layer.self_attn.out_proj.bias 99 | self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight 100 | self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias 101 | 102 | ### intermediate 103 | intermediate: BertIntermediate = layer.intermediate 104 | assert( 105 | intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape 106 | ) 107 | intermediate.dense.weight = roberta_layer.fc1.weight 108 | intermediate.dense.bias = roberta_layer.fc1.bias 109 | 110 | ### output 111 | bert_output: BertOutput = layer.output 112 | assert( 113 | bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape 114 | ) 115 | bert_output.dense.weight = roberta_layer.fc2.weight 116 | bert_output.dense.bias = roberta_layer.fc2.bias 117 | bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight 118 | bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias 119 | #### end of layer 120 | 121 | if classification_head: 122 | model.classifier.dense.weight = roberta.model.classification_heads['mnli'].dense.weight 123 | model.classifier.dense.bias = roberta.model.classification_heads['mnli'].dense.bias 124 | model.classifier.out_proj.weight = roberta.model.classification_heads['mnli'].out_proj.weight 125 | model.classifier.out_proj.bias = roberta.model.classification_heads['mnli'].out_proj.bias 126 | else: 127 | # LM Head 128 | model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight 129 | model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias 130 | model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight 131 | model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias 132 | model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight 133 | model.lm_head.bias = roberta.model.decoder.lm_head.bias 134 | 135 | # Let's check that we get the same results. 
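# (the sample sentence is encoded once and run through both the converted model
#  and the original fairseq model; the conversion is accepted only if the two
#  output tensors agree element-wise within an absolute tolerance of 1e-3,
#  otherwise an exception is raised below)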
136 | input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1 137 | 138 | our_output = model(input_ids)[0] 139 | if classification_head: 140 | their_output = roberta.model.classification_heads['mnli'](roberta.extract_features(input_ids)) 141 | else: 142 | their_output = roberta.model(input_ids)[0] 143 | print(our_output.shape, their_output.shape) 144 | max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() 145 | print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 146 | success = torch.allclose(our_output, their_output, atol=1e-3) 147 | print( 148 | "Do both models output the same tensors?", 149 | "🔥" if success else "💩" 150 | ) 151 | if not success: 152 | raise Exception("Something went wRoNg") 153 | 154 | print(f"Saving model to {pytorch_dump_folder_path}") 155 | model.save_pretrained(pytorch_dump_folder_path) 156 | 157 | 158 | if __name__ == "__main__": 159 | parser = argparse.ArgumentParser() 160 | ## Required parameters 161 | parser.add_argument("--roberta_checkpoint_path", 162 | default = None, 163 | type = str, 164 | required = True, 165 | help = "Path the official PyTorch dump.") 166 | parser.add_argument("--pytorch_dump_folder_path", 167 | default = None, 168 | type = str, 169 | required = True, 170 | help = "Path to the output PyTorch model.") 171 | parser.add_argument("--classification_head", 172 | action = "store_true", 173 | help = "Whether to convert a final classification head.") 174 | args = parser.parse_args() 175 | convert_roberta_checkpoint_to_pytorch( 176 | args.roberta_checkpoint_path, 177 | args.pytorch_dump_folder_path, 178 | args.classification_head 179 | ) 180 | 181 | -------------------------------------------------------------------------------- /transformers/tokenization_auto.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Auto Model class. 
""" 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import logging 20 | 21 | from .tokenization_bert import BertTokenizer 22 | from .tokenization_bert_japanese import BertJapaneseTokenizer 23 | from .tokenization_openai import OpenAIGPTTokenizer 24 | from .tokenization_gpt2 import GPT2Tokenizer 25 | from .tokenization_ctrl import CTRLTokenizer 26 | from .tokenization_transfo_xl import TransfoXLTokenizer 27 | from .tokenization_xlnet import XLNetTokenizer 28 | from .tokenization_xlm import XLMTokenizer 29 | from .tokenization_roberta import RobertaTokenizer 30 | from .tokenization_distilbert import DistilBertTokenizer 31 | from .tokenization_camembert import CamembertTokenizer 32 | from .tokenization_albert import AlbertTokenizer 33 | 34 | logger = logging.getLogger(__name__) 35 | 36 | class AutoTokenizer(object): 37 | r""":class:`~transformers.AutoTokenizer` is a generic tokenizer class 38 | that will be instantiated as one of the tokenizer classes of the library 39 | when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` 40 | class method. 41 | 42 | The `from_pretrained()` method take care of returning the correct tokenizer class instance 43 | using pattern matching on the `pretrained_model_name_or_path` string. 44 | 45 | The tokenizer class to instantiate is selected as the first pattern matching 46 | in the `pretrained_model_name_or_path` string (in the following order): 47 | - contains `distilbert`: DistilBertTokenizer (DistilBert model) 48 | - contains `albert`: AlbertTokenizer (ALBERT model) 49 | - contains `camembert`: CamembertTokenizer (CamemBERT model) 50 | - contains `roberta`: RobertaTokenizer (RoBERTa model) 51 | - contains `bert`: BertTokenizer (Bert model) 52 | - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) 53 | - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) 54 | - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) 55 | - contains `xlnet`: XLNetTokenizer (XLNet model) 56 | - contains `xlm`: XLMTokenizer (XLM model) 57 | - contains `ctrl`: CTRLTokenizer (Salesforce CTRL model) 58 | 59 | This class cannot be instantiated using `__init__()` (throw an error). 60 | """ 61 | def __init__(self): 62 | raise EnvironmentError("AutoTokenizer is designed to be instantiated " 63 | "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method.") 64 | 65 | @classmethod 66 | def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): 67 | r""" Instantiate a one of the tokenizer classes of the library 68 | from a pre-trained model vocabulary. 
69 | 70 | The tokenizer class to instantiate is selected as the first pattern matching 71 | in the `pretrained_model_name_or_path` string (in the following order): 72 | - contains `distilbert`: DistilBertTokenizer (DistilBert model) 73 | - contains `albert`: AlbertTokenizer (ALBERT model) 74 | - contains `camembert`: CamembertTokenizer (CamemBERT model) 75 | - contains `roberta`: RobertaTokenizer (RoBERTa model) 76 | - contains `bert-base-japanese`: BertJapaneseTokenizer (Bert model) 77 | - contains `bert`: BertTokenizer (Bert model) 78 | - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) 79 | - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) 80 | - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) 81 | - contains `xlnet`: XLNetTokenizer (XLNet model) 82 | - contains `xlm`: XLMTokenizer (XLM model) 83 | - contains `ctrl`: CTRLTokenizer (Salesforce CTRL model) 84 | 85 | Params: 86 | pretrained_model_name_or_path: either: 87 | 88 | - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. 89 | - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. 90 | - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. 91 | - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. 92 | 93 | cache_dir: (`optional`) string: 94 | Path to a directory in which the downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. 95 | 96 | force_download: (`optional`) boolean, default False: 97 | Force to (re-)download the vocabulary files and override the cached versions if they exist. 98 | 99 | resume_download: (`optional`) boolean, default False: 100 | Do not delete incompletely received files. Attempt to resume the download if such a file exists. 101 | 102 | proxies: (`optional`) dict, default None: 103 | A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. 104 | The proxies are used on each request. 105 | 106 | inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. 107 | 108 | kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details. 109 | 110 | Examples:: 111 | 112 | # Download vocabulary from S3 and cache. 113 | tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') 114 | 115 | # Download vocabulary from S3 (user-uploaded) and cache. 116 | tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased') 117 | 118 | # If vocabulary files are in a directory (e.g. 
tokenizer was saved using `save_pretrained('./test/saved_model/')`) 119 | tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/') 120 | 121 | """ 122 | if 'distilbert' in pretrained_model_name_or_path: 123 | return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 124 | elif 'albert' in pretrained_model_name_or_path: 125 | return AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 126 | elif 'camembert' in pretrained_model_name_or_path: 127 | return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 128 | elif 'roberta' in pretrained_model_name_or_path: 129 | return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 130 | elif 'bert-base-japanese' in pretrained_model_name_or_path: 131 | return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 132 | elif 'bert' in pretrained_model_name_or_path: 133 | return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 134 | elif 'openai-gpt' in pretrained_model_name_or_path: 135 | return OpenAIGPTTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 136 | elif 'gpt2' in pretrained_model_name_or_path: 137 | return GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 138 | elif 'transfo-xl' in pretrained_model_name_or_path: 139 | return TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 140 | elif 'xlnet' in pretrained_model_name_or_path: 141 | return XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 142 | elif 'xlm' in pretrained_model_name_or_path: 143 | return XLMTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 144 | elif 'ctrl' in pretrained_model_name_or_path: 145 | return CTRLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 146 | raise ValueError("Unrecognized model identifier in {}. Should contains one of " 147 | "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " 148 | "'xlm', 'roberta', 'distilbert,' 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path)) 149 | -------------------------------------------------------------------------------- /transformers/configuration_auto.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Auto Model class. 
""" 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import logging 20 | 21 | from .configuration_bert import BertConfig 22 | from .configuration_openai import OpenAIGPTConfig 23 | from .configuration_gpt2 import GPT2Config 24 | from .configuration_transfo_xl import TransfoXLConfig 25 | from .configuration_xlnet import XLNetConfig 26 | from .configuration_xlm import XLMConfig 27 | from .configuration_roberta import RobertaConfig 28 | from .configuration_distilbert import DistilBertConfig 29 | from .configuration_ctrl import CTRLConfig 30 | from .configuration_camembert import CamembertConfig 31 | from .configuration_albert import AlbertConfig 32 | 33 | logger = logging.getLogger(__name__) 34 | 35 | 36 | class AutoConfig(object): 37 | r""":class:`~transformers.AutoConfig` is a generic configuration class 38 | that will be instantiated as one of the configuration classes of the library 39 | when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` 40 | class method. 41 | 42 | The `from_pretrained()` method take care of returning the correct model class instance 43 | using pattern matching on the `pretrained_model_name_or_path` string. 44 | 45 | The base model class to instantiate is selected as the first pattern matching 46 | in the `pretrained_model_name_or_path` string (in the following order): 47 | - contains `distilbert`: DistilBertConfig (DistilBERT model) 48 | - contains `albert`: AlbertConfig (ALBERT model) 49 | - contains `camembert`: CamembertConfig (CamemBERT model) 50 | - contains `roberta`: RobertaConfig (RoBERTa model) 51 | - contains `bert`: BertConfig (Bert model) 52 | - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) 53 | - contains `gpt2`: GPT2Config (OpenAI GPT-2 model) 54 | - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model) 55 | - contains `xlnet`: XLNetConfig (XLNet model) 56 | - contains `xlm`: XLMConfig (XLM model) 57 | - contains `ctrl` : CTRLConfig (CTRL model) 58 | This class cannot be instantiated using `__init__()` (throw an error). 59 | """ 60 | def __init__(self): 61 | raise EnvironmentError("AutoConfig is designed to be instantiated " 62 | "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.") 63 | 64 | @classmethod 65 | def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): 66 | r""" Instantiate a one of the configuration classes of the library 67 | from a pre-trained model configuration. 68 | 69 | The configuration class to instantiate is selected as the first pattern matching 70 | in the `pretrained_model_name_or_path` string (in the following order): 71 | - contains `distilbert`: DistilBertConfig (DistilBERT model) 72 | - contains `albert`: AlbertConfig (ALBERT model) 73 | - contains `camembert`: CamembertConfig (CamemBERT model) 74 | - contains `roberta`: RobertaConfig (RoBERTa model) 75 | - contains `bert`: BertConfig (Bert model) 76 | - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) 77 | - contains `gpt2`: GPT2Config (OpenAI GPT-2 model) 78 | - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model) 79 | - contains `xlnet`: XLNetConfig (XLNet model) 80 | - contains `xlm`: XLMConfig (XLM model) 81 | - contains `ctrl` : CTRLConfig (CTRL model) 82 | Params: 83 | pretrained_model_name_or_path: either: 84 | 85 | - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. 
86 | - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. 87 | - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. 88 | - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. 89 | 90 | cache_dir: (`optional`) string: 91 | Path to a directory in which a downloaded pre-trained model 92 | configuration should be cached if the standard cache should not be used. 93 | 94 | kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading. 95 | 96 | - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. 97 | - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. 98 | 99 | force_download: (`optional`) boolean, default False: 100 | Force to (re-)download the model weights and configuration files and override the cached versions if they exist. 101 | 102 | resume_download: (`optional`) boolean, default False: 103 | Do not delete incompletely received files. Attempt to resume the download if such a file exists. 104 | 105 | proxies: (`optional`) dict, default None: 106 | A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. 107 | The proxies are used on each request. 108 | 109 | return_unused_kwargs: (`optional`) bool: 110 | 111 | - If False, then this function returns just the final configuration object. 112 | - If True, then this function returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e. the part of kwargs which has not been used to update `config` and is otherwise ignored. 113 | 114 | Examples:: 115 | 116 | config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. 117 | config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. 
config (or model) was saved using `save_pretrained('./test/saved_model/')` 118 | config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json') 119 | config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) 120 | assert config.output_attention == True 121 | config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, 122 | foo=False, return_unused_kwargs=True) 123 | assert config.output_attention == True 124 | assert unused_kwargs == {'foo': False} 125 | 126 | """ 127 | if 'distilbert' in pretrained_model_name_or_path: 128 | return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) 129 | elif 'albert' in pretrained_model_name_or_path: 130 | return AlbertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) 131 | elif 'camembert' in pretrained_model_name_or_path: 132 | return CamembertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) 133 | elif 'roberta' in pretrained_model_name_or_path: 134 | return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) 135 | elif 'bert' in pretrained_model_name_or_path: 136 | return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) 137 | elif 'openai-gpt' in pretrained_model_name_or_path: 138 | return OpenAIGPTConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) 139 | elif 'gpt2' in pretrained_model_name_or_path: 140 | return GPT2Config.from_pretrained(pretrained_model_name_or_path, **kwargs) 141 | elif 'transfo-xl' in pretrained_model_name_or_path: 142 | return TransfoXLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) 143 | elif 'xlnet' in pretrained_model_name_or_path: 144 | return XLNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) 145 | elif 'xlm' in pretrained_model_name_or_path: 146 | return XLMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) 147 | elif 'ctrl' in pretrained_model_name_or_path: 148 | return CTRLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) 149 | raise ValueError("Unrecognized model identifier in {}. Should contains one of " 150 | "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " 151 | "'xlm', 'roberta', 'distilbert', 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path)) 152 | -------------------------------------------------------------------------------- /transformers/optimization_tf.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | 23 | import tensorflow as tf 24 | 25 | 26 | class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule): 27 | """Applys a warmup schedule on a given learning rate decay schedule.""" 28 | 29 | def __init__( 30 | self, 31 | initial_learning_rate, 32 | decay_schedule_fn, 33 | warmup_steps, 34 | power=1.0, 35 | name=None): 36 | super(WarmUp, self).__init__() 37 | self.initial_learning_rate = initial_learning_rate 38 | self.warmup_steps = warmup_steps 39 | self.power = power 40 | self.decay_schedule_fn = decay_schedule_fn 41 | self.name = name 42 | 43 | def __call__(self, step): 44 | with tf.name_scope(self.name or 'WarmUp') as name: 45 | # Implements polynomial warmup. i.e., if global_step < warmup_steps, the 46 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 47 | global_step_float = tf.cast(step, tf.float32) 48 | warmup_steps_float = tf.cast(self.warmup_steps, tf.float32) 49 | warmup_percent_done = global_step_float / warmup_steps_float 50 | warmup_learning_rate = ( 51 | self.initial_learning_rate * 52 | tf.math.pow(warmup_percent_done, self.power)) 53 | return tf.cond(global_step_float < warmup_steps_float, 54 | lambda: warmup_learning_rate, 55 | lambda: self.decay_schedule_fn(step), 56 | name=name) 57 | 58 | def get_config(self): 59 | return { 60 | 'initial_learning_rate': self.initial_learning_rate, 61 | 'decay_schedule_fn': self.decay_schedule_fn, 62 | 'warmup_steps': self.warmup_steps, 63 | 'power': self.power, 64 | 'name': self.name 65 | } 66 | 67 | 68 | def create_optimizer(init_lr, num_train_steps, num_warmup_steps): 69 | """Creates an optimizer with learning rate schedule.""" 70 | # Implements linear decay of the learning rate. 71 | learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay( 72 | initial_learning_rate=init_lr, 73 | decay_steps=num_train_steps, 74 | end_learning_rate=0.0) 75 | if num_warmup_steps: 76 | learning_rate_fn = WarmUp(initial_learning_rate=init_lr, 77 | decay_schedule_fn=learning_rate_fn, 78 | warmup_steps=num_warmup_steps) 79 | optimizer = AdamWeightDecay( 80 | learning_rate=learning_rate_fn, 81 | weight_decay_rate=0.01, 82 | beta_1=0.9, 83 | beta_2=0.999, 84 | epsilon=1e-6, 85 | exclude_from_weight_decay=['layer_norm', 'bias']) 86 | return optimizer 87 | 88 | 89 | class AdamWeightDecay(tf.keras.optimizers.Adam): 90 | """Adam enables L2 weight decay and clip_by_global_norm on gradients. 91 | 92 | Just adding the square of the weights to the loss function is *not* the 93 | correct way of using L2 regularization/weight decay with Adam, since that will 94 | interact with the m and v parameters in strange ways. 95 | 96 | Instead we want ot decay the weights in a manner that doesn't interact with 97 | the m/v parameters. This is equivalent to adding the square of the weights to 98 | the loss with plain (non-momentum) SGD. 
99 | """ 100 | 101 | def __init__(self, 102 | learning_rate=0.001, 103 | beta_1=0.9, 104 | beta_2=0.999, 105 | epsilon=1e-7, 106 | amsgrad=False, 107 | weight_decay_rate=0.0, 108 | include_in_weight_decay=None, 109 | exclude_from_weight_decay=None, 110 | name='AdamWeightDecay', 111 | **kwargs): 112 | super(AdamWeightDecay, self).__init__( 113 | learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs) 114 | self.weight_decay_rate = weight_decay_rate 115 | self._include_in_weight_decay = include_in_weight_decay 116 | self._exclude_from_weight_decay = exclude_from_weight_decay 117 | 118 | @classmethod 119 | def from_config(cls, config): 120 | """Creates an optimizer from its config with WarmUp custom object.""" 121 | custom_objects = {'WarmUp': WarmUp} 122 | return super(AdamWeightDecay, cls).from_config( 123 | config, custom_objects=custom_objects) 124 | 125 | def _prepare_local(self, var_device, var_dtype, apply_state): 126 | super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype, 127 | apply_state) 128 | apply_state['weight_decay_rate'] = tf.constant( 129 | self.weight_decay_rate, name='adam_weight_decay_rate') 130 | 131 | def _decay_weights_op(self, var, learning_rate, apply_state): 132 | do_decay = self._do_use_weight_decay(var.name) 133 | if do_decay: 134 | return var.assign_sub( 135 | learning_rate * var * 136 | apply_state['weight_decay_rate'], 137 | use_locking=self._use_locking) 138 | return tf.no_op() 139 | 140 | def apply_gradients(self, grads_and_vars, clip_norm, name=None): 141 | grads, tvars = list(zip(*grads_and_vars)) 142 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm) 143 | return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars)) 144 | 145 | def _get_lr(self, var_device, var_dtype, apply_state): 146 | """Retrieves the learning rate with the given state.""" 147 | if apply_state is None: 148 | return self._decayed_lr_t[var_dtype], {} 149 | 150 | apply_state = apply_state or {} 151 | coefficients = apply_state.get((var_device, var_dtype)) 152 | if coefficients is None: 153 | coefficients = self._fallback_apply_state(var_device, var_dtype) 154 | apply_state[(var_device, var_dtype)] = coefficients 155 | 156 | return coefficients['lr_t'], dict(apply_state=apply_state) 157 | 158 | def _resource_apply_dense(self, grad, var, apply_state=None): 159 | lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state) 160 | decay = self._decay_weights_op(var, lr_t, apply_state) 161 | with tf.control_dependencies([decay]): 162 | return super(AdamWeightDecay, self)._resource_apply_dense( 163 | grad, var, **kwargs) 164 | 165 | def _resource_apply_sparse(self, grad, var, indices, apply_state=None): 166 | lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state) 167 | decay = self._decay_weights_op(var, lr_t, apply_state) 168 | with tf.control_dependencies([decay]): 169 | return super(AdamWeightDecay, self)._resource_apply_sparse( 170 | grad, var, indices, **kwargs) 171 | 172 | def get_config(self): 173 | config = super(AdamWeightDecay, self).get_config() 174 | config.update({ 175 | 'weight_decay_rate': self.weight_decay_rate, 176 | }) 177 | return config 178 | 179 | def _do_use_weight_decay(self, param_name): 180 | """Whether to use L2 weight decay for `param_name`.""" 181 | if self.weight_decay_rate == 0: 182 | return False 183 | 184 | if self._include_in_weight_decay: 185 | for r in self._include_in_weight_decay: 186 | if re.search(r, param_name) is not None: 187 | return True 188 | 189 | if 
self._exclude_from_weight_decay: 190 | for r in self._exclude_from_weight_decay: 191 | if re.search(r, param_name) is not None: 192 | return False 193 | return True 194 | 195 | 196 | ## Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py 197 | class GradientAccumulator(object): 198 | """Distribution strategies-aware gradient accumulation utility.""" 199 | 200 | def __init__(self): 201 | """Initializes the accumulator.""" 202 | self._gradients = [] 203 | self._accum_steps = tf.Variable( 204 | initial_value=0, 205 | dtype=tf.int64, 206 | trainable=False, 207 | aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA) 208 | 209 | @property 210 | def step(self): 211 | """Number of accumulated steps.""" 212 | return self._accum_steps.value() 213 | 214 | @property 215 | def gradients(self): 216 | """The accumulated gradients.""" 217 | return list(gradient.value() if gradient is not None else gradient for gradient in self._get_replica_gradients()) 218 | 219 | def __call__(self, gradients): 220 | """Accumulates :obj:`gradients`.""" 221 | if not self._gradients: 222 | self._gradients.extend([tf.Variable(tf.zeros_like(gradient), trainable=False) if gradient is not None else gradient for gradient in gradients]) 223 | 224 | if len(gradients) != len(self._gradients): 225 | raise ValueError("Expected %s gradients, but got %d" % (len(self._gradients), len(gradients))) 226 | 227 | for accum_gradient, gradient in zip(self._get_replica_gradients(), gradients): 228 | if accum_gradient is not None: 229 | accum_gradient.assign_add(gradient) 230 | 231 | self._accum_steps.assign_add(1) 232 | 233 | def reset(self): 234 | """Resets the accumulated gradients.""" 235 | if self._gradients: 236 | self._accum_steps.assign(0) 237 | 238 | for gradient in self._get_replica_gradients(): 239 | if gradient is not None: 240 | gradient.assign(tf.zeros_like(gradient)) 241 | 242 | def _get_replica_gradients(self): 243 | if tf.distribute.has_strategy(): 244 | # In a replica context, we want to accumulate gradients on each replica 245 | # without synchronization, so we directly assign the value of the 246 | # current replica. 247 | replica_context = tf.distribute.get_replica_context() 248 | 249 | if replica_context is None or tf.distribute.get_strategy().num_replicas_in_sync == 1: 250 | return self._gradients 251 | 252 | return (gradient.device_map.select_for_current_replica(gradient.values, replica_context) for gradient in self._gradients) 253 | else: 254 | return self._gradients 255 | --------------------------------------------------------------------------------
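The optimization utilities in `transformers/optimization_tf.py` (`create_optimizer`, `AdamWeightDecay`, `GradientAccumulator`) are meant to be combined in a custom TF2 training loop. The following is a minimal, self-contained sketch of one possible wiring; the toy model, data shapes, `accum_steps` and learning-rate settings are placeholder assumptions, not values used by this repository.

```python
# Minimal sketch: gradient accumulation with the TF optimization utilities above.
# The model, data and hyperparameters are placeholders for illustration only.
import tensorflow as tf

from transformers.optimization_tf import create_optimizer, GradientAccumulator

model = tf.keras.Sequential([tf.keras.layers.Dense(3)])  # toy stand-in for a real model
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Linear warmup over the first 100 steps, then linear decay to 0 over 1000 steps,
# wrapped in AdamWeightDecay (decay is skipped for 'layer_norm' and 'bias' params).
optimizer = create_optimizer(init_lr=5e-5, num_train_steps=1000, num_warmup_steps=100)

accumulator = GradientAccumulator()
accum_steps = 4  # apply one optimizer update every 4 micro-batches

def train_step(features, labels):
    with tf.GradientTape() as tape:
        logits = model(features, training=True)
        loss = loss_fn(labels, logits)
    grads = tape.gradient(loss, model.trainable_variables)
    accumulator(grads)  # adds into per-variable buffers and bumps the step counter
    if accumulator.step % accum_steps == 0:
        # The buffers hold a sum, so divide by accum_steps to apply the mean gradient.
        scaled = [g / tf.cast(accum_steps, g.dtype) for g in accumulator.gradients]
        # AdamWeightDecay.apply_gradients also clips by global norm, so clip_norm
        # must be passed explicitly (see its signature above).
        optimizer.apply_gradients(zip(scaled, model.trainable_variables), clip_norm=1.0)
        accumulator.reset()
    return loss

features = tf.random.normal((8, 16))
labels = tf.random.uniform((8,), maxval=3, dtype=tf.int32)
for _ in range(accum_steps):
    train_step(features, labels)
```

Note that `GradientAccumulator` only sums gradients and counts steps; deciding when to apply an update, rescaling by the number of accumulated micro-batches, and calling `reset()` are left to the training loop, as sketched above.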