├── requirements.txt
├── transformers
    ├── commands
    │   ├── __init__.py
    │   └── user.py
    ├── data
    │   ├── processors
    │   │   ├── __init__.py
    │   │   ├── xnli.py
    │   │   └── utils.py
    │   ├── __init__.py
    │   └── metrics
    │   │   └── __init__.py
    ├── configuration_camembert.py
    ├── configuration_roberta.py
    ├── convert_bert_original_tf_checkpoint_to_pytorch.py
    ├── convert_albert_original_tf_checkpoint_to_pytorch.py
    ├── tokenization_distilbert.py
    ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py
    ├── convert_openai_original_tf_checkpoint_to_pytorch.py
    ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py
    ├── configuration_distilbert.py
    ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py
    ├── convert_bert_pytorch_checkpoint_to_original_tf.py
    ├── configuration_albert.py
    ├── configuration_openai.py
    ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
    ├── configuration_ctrl.py
    ├── configuration_gpt2.py
    ├── configuration_xlnet.py
    ├── configuration_transfo_xl.py
    ├── __main__.py
    ├── hf_api.py
    ├── configuration_bert.py
    ├── tokenization_roberta.py
    ├── tokenization_camembert.py
    ├── optimization.py
    ├── tokenization_openai.py
    ├── configuration_xlm.py
    ├── modeling_tf_transfo_xl_utilities.py
    ├── tokenization_ctrl.py
    ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py
    ├── tokenization_auto.py
    ├── configuration_auto.py
    └── optimization_tf.py
└── README.md


/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorboardX
2 | tensorboard
3 | scikit-learn
4 | seqeval
5 | 


--------------------------------------------------------------------------------
/transformers/commands/__init__.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | from argparse import ArgumentParser
 3 | 
 4 | class BaseTransformersCLICommand(ABC):
 5 |     @staticmethod
 6 |     @abstractmethod
 7 |     def register_subcommand(parser: ArgumentParser):
 8 |         raise NotImplementedError()
 9 | 
10 |     @abstractmethod
11 |     def run(self):
12 |         raise NotImplementedError()
13 | 


--------------------------------------------------------------------------------
/transformers/data/processors/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import InputExample, InputFeatures, DataProcessor
2 | from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
3 | from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor
4 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels


--------------------------------------------------------------------------------
/transformers/data/__init__.py:
--------------------------------------------------------------------------------
1 | from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures
2 | from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
3 | from .processors import squad_convert_examples_to_features, SquadExample, SquadV1Processor, SquadV2Processor
4 | from .processors import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
5 | 
6 | from .metrics import is_sklearn_available
7 | if is_sklearn_available():
8 |     from .metrics import glue_compute_metrics, xnli_compute_metrics
9 | 


--------------------------------------------------------------------------------
/transformers/configuration_camembert.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ CamemBERT configuration """
17 | 
18 | from __future__ import (absolute_import, division, print_function,
19 |                         unicode_literals)
20 | 
21 | import logging
22 | 
23 | from .configuration_roberta import RobertaConfig
24 | 
25 | logger = logging.getLogger(__name__)
26 | 
27 | CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
28 |     'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json",
29 | }
30 | 
31 | 
32 | class CamembertConfig(RobertaConfig):
33 |     pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
34 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # ALBERT + DUMA
 2 | 
 3 | This is the source code of our paper 《[DUMA: Reading Comprehension with Transposition Thinking](https://ieeexplore.ieee.org/document/9664302)》. The codes are written based on https://github.com/huggingface/transformers .
 4 | 
 5 | The codes are tested with pytorch 1.0.0 and python 3.6. If you want to use fp16 for training, please make sure the version is commit 33512f9 of https://github.com/NVIDIA/apex .
 6 | 
 7 | It is recommended to download the model, config and vocab file and replace the path in trainsformers/{modeling_albert.py, configuration_albert.py, tokenization_albert.py}.
 8 | 
 9 | Download the train.json, dev.json, test.json from https://github.com/nlpdata/dream/tree/master/data and save them into DATA_DIR.
10 | 
11 | To run ALBERT on DREAM dataset, the script is:
12 | ```bash
13 | export DATA_DIR=/path/to/data
14 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python run_multiple_choice.py \
15 | --do_lower_case \
16 | --do_train \
17 | --do_eval \
18 | --overwrite_output \
19 | --overwrite_cache \
20 | --eval_all_checkpoints \
21 | --task_name dream \
22 | --per_gpu_eval_batch_size=10 \
23 | --logging_steps 1 \
24 | --max_seq_length 512 \
25 | --model_type albert \
26 | --model_name_or_path albert-base-v2 \
27 | --data_dir $DATA_DIR \
28 | --learning_rate 5e-6 \
29 | --num_train_epochs 15 \
30 | --output_dir albert_base_dream \
31 | --per_gpu_train_batch_size=1 \
32 | --gradient_accumulation_steps 1 \
33 | --warmup_steps 100 \
34 | --save_steps 764
35 | ```
36 | 
37 | To run ALBERT+DUMA on DREAM dataset, you should replace AlbertForMultipleChoice with AlbertDUMAForMultipleChoice.
38 | 
39 | Performance outputs of checkpoints will be saved in my_eval_results.txt .
40 | 
41 | For Albert xxlarge, please refer to the paper for parameter settings.
42 | 
43 | More details will be added in the future.
44 | 


--------------------------------------------------------------------------------
/transformers/configuration_roberta.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ RoBERTa configuration """
17 | 
18 | from __future__ import (absolute_import, division, print_function,
19 |                         unicode_literals)
20 | 
21 | import logging
22 | 
23 | from .configuration_bert import BertConfig
24 | 
25 | logger = logging.getLogger(__name__)
26 | 
27 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
28 |     'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json",
29 |     'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json",
30 |     'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json",
31 |     'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json",
32 |     'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-config.json",
33 |     'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-config.json",
34 | }
35 | 
36 | 
37 | class RobertaConfig(BertConfig):
38 |     pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
39 | 


--------------------------------------------------------------------------------
/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The HuggingFace Inc. team.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert BERT checkpoint."""
16 | 
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | import argparse
22 | import torch
23 | 
24 | from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert
25 | 
26 | import logging
27 | logging.basicConfig(level=logging.INFO)
28 | 
29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
30 |     # Initialise PyTorch model
31 |     config = BertConfig.from_json_file(bert_config_file)
32 |     print("Building PyTorch model from configuration: {}".format(str(config)))
33 |     model = BertForPreTraining(config)
34 | 
35 |     # Load weights from tf checkpoint
36 |     load_tf_weights_in_bert(model, config, tf_checkpoint_path)
37 | 
38 |     # Save pytorch-model
39 |     print("Save PyTorch model to {}".format(pytorch_dump_path))
40 |     torch.save(model.state_dict(), pytorch_dump_path)
41 | 
42 | 
43 | if __name__ == "__main__":
44 |     parser = argparse.ArgumentParser()
45 |     ## Required parameters
46 |     parser.add_argument("--tf_checkpoint_path",
47 |                         default = None,
48 |                         type = str,
49 |                         required = True,
50 |                         help = "Path to the TensorFlow checkpoint path.")
51 |     parser.add_argument("--bert_config_file",
52 |                         default = None,
53 |                         type = str,
54 |                         required = True,
55 |                         help = "The config json file corresponding to the pre-trained BERT model. \n"
56 |                             "This specifies the model architecture.")
57 |     parser.add_argument("--pytorch_dump_path",
58 |                         default = None,
59 |                         type = str,
60 |                         required = True,
61 |                         help = "Path to the output PyTorch model.")
62 |     args = parser.parse_args()
63 |     convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
64 |                                      args.bert_config_file,
65 |                                      args.pytorch_dump_path)
66 | 


--------------------------------------------------------------------------------
/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The HuggingFace Inc. team.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert ALBERT checkpoint."""
16 | 
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | import argparse
22 | import torch
23 | 
24 | from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert
25 | 
26 | import logging
27 | logging.basicConfig(level=logging.INFO)
28 | 
29 | 
30 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
31 |     # Initialise PyTorch model
32 |     config = AlbertConfig.from_json_file(albert_config_file)
33 |     print("Building PyTorch model from configuration: {}".format(str(config)))
34 |     model = AlbertForMaskedLM(config)
35 | 
36 |     # Load weights from tf checkpoint
37 |     load_tf_weights_in_albert(model, config, tf_checkpoint_path)
38 | 
39 |     # Save pytorch-model
40 |     print("Save PyTorch model to {}".format(pytorch_dump_path))
41 |     torch.save(model.state_dict(), pytorch_dump_path)
42 | 
43 | 
44 | if __name__ == "__main__":
45 |     parser = argparse.ArgumentParser()
46 |     ## Required parameters
47 |     parser.add_argument("--tf_checkpoint_path",
48 |                         default = None,
49 |                         type = str,
50 |                         required = True,
51 |                         help = "Path to the TensorFlow checkpoint path.")
52 |     parser.add_argument("--albert_config_file",
53 |                         default = None,
54 |                         type = str,
55 |                         required = True,
56 |                         help = "The config json file corresponding to the pre-trained ALBERT model. \n"
57 |                             "This specifies the model architecture.")
58 |     parser.add_argument("--pytorch_dump_path",
59 |                         default = None,
60 |                         type = str,
61 |                         required = True,
62 |                         help = "Path to the output PyTorch model.")
63 |     args = parser.parse_args()
64 |     convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
65 |                                      args.albert_config_file,
66 |                                      args.pytorch_dump_path)
67 |  


--------------------------------------------------------------------------------
/transformers/tokenization_distilbert.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The HuggingFace Inc. team.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes for DistilBERT."""
16 | 
17 | from __future__ import absolute_import, division, print_function, unicode_literals
18 | 
19 | import collections
20 | import logging
21 | import os
22 | import unicodedata
23 | from io import open
24 | 
25 | from .tokenization_bert import BertTokenizer
26 | 
27 | logger = logging.getLogger(__name__)
28 | 
29 | VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
30 | 
31 | PRETRAINED_VOCAB_FILES_MAP = {
32 |     'vocab_file':
33 |     {
34 |         'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
35 |         'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
36 |         'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt",
37 |         'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
38 |     }
39 | }
40 | 
41 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
42 |     'distilbert-base-uncased': 512,
43 |     'distilbert-base-uncased-distilled-squad': 512,
44 |     'distilbert-base-german-cased': 512,
45 |     'distilbert-base-multilingual-cased': 512,
46 | }
47 | 
48 | 
49 | class DistilBertTokenizer(BertTokenizer):
50 |     r"""
51 |     Constructs a DistilBertTokenizer.
52 |     :class:`~transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece
53 | 
54 |     Args:
55 |         vocab_file: Path to a one-wordpiece-per-line vocabulary file
56 |         do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
57 |         do_basic_tokenize: Whether to do basic tokenization before wordpiece.
58 |         max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
59 |             minimum of this value (if specified) and the underlying BERT model's sequence length.
60 |         never_split: List of tokens which will never be split during tokenization. Only has an effect when
61 |             do_wordpiece_only=False
62 |     """
63 | 
64 |     vocab_files_names = VOCAB_FILES_NAMES
65 |     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
66 |     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
67 | 


--------------------------------------------------------------------------------
/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The HuggingFace Inc. team.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert OpenAI GPT checkpoint."""
16 | 
17 | from __future__ import absolute_import, division, print_function
18 | 
19 | import argparse
20 | from io import open
21 | 
22 | import torch
23 | 
24 | from transformers import (CONFIG_NAME, WEIGHTS_NAME,
25 |                                                      GPT2Config,
26 |                                                      GPT2Model,
27 |                                                      load_tf_weights_in_gpt2)
28 | 
29 | import logging
30 | logging.basicConfig(level=logging.INFO)
31 | 
32 | 
33 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path):
34 |     # Construct model
35 |     if gpt2_config_file == "":
36 |         config = GPT2Config()
37 |     else:
38 |         config = GPT2Config.from_json_file(gpt2_config_file)
39 |     model = GPT2Model(config)
40 | 
41 |     # Load weights from numpy
42 |     load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)
43 | 
44 |     # Save pytorch-model
45 |     pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
46 |     pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
47 |     print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
48 |     torch.save(model.state_dict(), pytorch_weights_dump_path)
49 |     print("Save configuration file to {}".format(pytorch_config_dump_path))
50 |     with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
51 |         f.write(config.to_json_string())
52 | 
53 | 
54 | if __name__ == "__main__":
55 |     parser = argparse.ArgumentParser()
56 |     ## Required parameters
57 |     parser.add_argument("--gpt2_checkpoint_path",
58 |                         default = None,
59 |                         type = str,
60 |                         required = True,
61 |                         help = "Path to the TensorFlow checkpoint path.")
62 |     parser.add_argument("--pytorch_dump_folder_path",
63 |                         default = None,
64 |                         type = str,
65 |                         required = True,
66 |                         help = "Path to the output PyTorch model.")
67 |     parser.add_argument("--gpt2_config_file",
68 |                         default = "",
69 |                         type = str,
70 |                         help = "An optional config json file corresponding to the pre-trained OpenAI model. \n"
71 |                             "This specifies the model architecture.")
72 |     args = parser.parse_args()
73 |     convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path,
74 |                                          args.gpt2_config_file,
75 |                                          args.pytorch_dump_folder_path)
76 | 


--------------------------------------------------------------------------------
/transformers/data/processors/xnli.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ XNLI utils (dataset loading and evaluation) """
17 | 
18 | from __future__ import absolute_import, division, print_function
19 | 
20 | import logging
21 | import os
22 | 
23 | from .utils import DataProcessor, InputExample
24 | 
25 | logger = logging.getLogger(__name__)
26 | 
27 | class XnliProcessor(DataProcessor):
28 |     """Processor for the XNLI dataset.
29 |     Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207"""
30 | 
31 |     def __init__(self, language, train_language = None):
32 |         self.language = language
33 |         self.train_language = train_language
34 | 
35 |     def get_train_examples(self, data_dir):
36 |         """See base class."""
37 |         lg = self.language if self.train_language is None else self.train_language
38 |         lines = self._read_tsv(os.path.join(data_dir, "XNLI-MT-1.0/multinli/multinli.train.{}.tsv".format(lg)))
39 |         examples = []
40 |         for (i, line) in enumerate(lines):
41 |             if i == 0:
42 |                 continue
43 |             guid = "%s-%s" % ('train', i)
44 |             text_a = line[0]
45 |             text_b = line[1]
46 |             label = "contradiction" if line[2] == "contradictory" else line[2]
47 |             assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str)
48 |             examples.append(
49 |                 InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
50 |         return examples
51 | 
52 |     def get_test_examples(self, data_dir):
53 |         """See base class."""
54 |         lines = self._read_tsv(os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv"))
55 |         examples = []
56 |         for (i, line) in enumerate(lines):
57 |             if i == 0:
58 |                 continue
59 |             language = line[0]
60 |             if language != self.language:
61 |                 continue
62 |             guid = "%s-%s" % ('test', i)
63 |             text_a = line[6]
64 |             text_b = line[7]
65 |             label = line[1]
66 |             assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str)
67 |             examples.append(
68 |                 InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
69 |         return examples
70 | 
71 |     def get_labels(self):
72 |         """See base class."""
73 |         return ["contradiction", "entailment", "neutral"]
74 | 
75 | xnli_processors = {
76 |     "xnli": XnliProcessor,
77 | }
78 | 
79 | xnli_output_modes = {
80 |     "xnli": "classification",
81 | }
82 | 
83 | xnli_tasks_num_labels = {
84 |     "xnli": 3,
85 | }
86 | 


--------------------------------------------------------------------------------
/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The HuggingFace Inc. team.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert OpenAI GPT checkpoint."""
16 | 
17 | from __future__ import absolute_import, division, print_function
18 | 
19 | import argparse
20 | from io import open
21 | 
22 | import torch
23 | 
24 | from transformers import (CONFIG_NAME, WEIGHTS_NAME,
25 |                                                      OpenAIGPTConfig,
26 |                                                      OpenAIGPTModel,
27 |                                                      load_tf_weights_in_openai_gpt)
28 | 
29 | import logging
30 | logging.basicConfig(level=logging.INFO)
31 | 
32 | 
33 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path):
34 |     # Construct model
35 |     if openai_config_file == "":
36 |         config = OpenAIGPTConfig()
37 |     else:
38 |         config = OpenAIGPTConfig.from_json_file(openai_config_file)
39 |     model = OpenAIGPTModel(config)
40 | 
41 |     # Load weights from numpy
42 |     load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path)
43 | 
44 |     # Save pytorch-model
45 |     pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
46 |     pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
47 |     print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
48 |     torch.save(model.state_dict(), pytorch_weights_dump_path)
49 |     print("Save configuration file to {}".format(pytorch_config_dump_path))
50 |     with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
51 |         f.write(config.to_json_string())
52 | 
53 | 
54 | if __name__ == "__main__":
55 |     parser = argparse.ArgumentParser()
56 |     ## Required parameters
57 |     parser.add_argument("--openai_checkpoint_folder_path",
58 |                         default = None,
59 |                         type = str,
60 |                         required = True,
61 |                         help = "Path to the TensorFlow checkpoint path.")
62 |     parser.add_argument("--pytorch_dump_folder_path",
63 |                         default = None,
64 |                         type = str,
65 |                         required = True,
66 |                         help = "Path to the output PyTorch model.")
67 |     parser.add_argument("--openai_config_file",
68 |                         default = "",
69 |                         type = str,
70 |                         help = "An optional config json file corresponding to the pre-trained OpenAI model. \n"
71 |                             "This specifies the model architecture.")
72 |     args = parser.parse_args()
73 |     convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path,
74 |                                          args.openai_config_file,
75 |                                          args.pytorch_dump_folder_path)
76 | 


--------------------------------------------------------------------------------
/transformers/data/metrics/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | import csv
18 | import sys
19 | import logging
20 | 
21 | logger = logging.getLogger(__name__)
22 | 
23 | try:
24 |     from scipy.stats import pearsonr, spearmanr
25 |     from sklearn.metrics import matthews_corrcoef, f1_score
26 |     _has_sklearn = True
27 | except (AttributeError, ImportError) as e:
28 |     logger.warning("To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html")
29 |     _has_sklearn = False
30 | 
31 | def is_sklearn_available():
32 |     return _has_sklearn
33 | 
34 | if _has_sklearn:
35 | 
36 |     def simple_accuracy(preds, labels):
37 |         return (preds == labels).mean()
38 | 
39 | 
40 |     def acc_and_f1(preds, labels):
41 |         acc = simple_accuracy(preds, labels)
42 |         f1 = f1_score(y_true=labels, y_pred=preds)
43 |         return {
44 |             "acc": acc,
45 |             "f1": f1,
46 |             "acc_and_f1": (acc + f1) / 2,
47 |         }
48 | 
49 | 
50 |     def pearson_and_spearman(preds, labels):
51 |         pearson_corr = pearsonr(preds, labels)[0]
52 |         spearman_corr = spearmanr(preds, labels)[0]
53 |         return {
54 |             "pearson": pearson_corr,
55 |             "spearmanr": spearman_corr,
56 |             "corr": (pearson_corr + spearman_corr) / 2,
57 |         }
58 | 
59 | 
60 |     def glue_compute_metrics(task_name, preds, labels):
61 |         assert len(preds) == len(labels)
62 |         if task_name == "cola":
63 |             return {"mcc": matthews_corrcoef(labels, preds)}
64 |         elif task_name == "sst-2":
65 |             return {"acc": simple_accuracy(preds, labels)}
66 |         elif task_name == "mrpc":
67 |             return acc_and_f1(preds, labels)
68 |         elif task_name == "sts-b":
69 |             return pearson_and_spearman(preds, labels)
70 |         elif task_name == "qqp":
71 |             return acc_and_f1(preds, labels)
72 |         elif task_name == "mnli":
73 |             return {"acc": simple_accuracy(preds, labels)}
74 |         elif task_name == "mnli-mm":
75 |             return {"acc": simple_accuracy(preds, labels)}
76 |         elif task_name == "qnli":
77 |             return {"acc": simple_accuracy(preds, labels)}
78 |         elif task_name == "rte":
79 |             return {"acc": simple_accuracy(preds, labels)}
80 |         elif task_name == "wnli":
81 |             return {"acc": simple_accuracy(preds, labels)}
82 |         else:
83 |             raise KeyError(task_name)
84 | 
85 | 
86 |     def xnli_compute_metrics(task_name, preds, labels):
87 |         assert len(preds) == len(labels)
88 |         if task_name == "xnli":
89 |             return {"acc": simple_accuracy(preds, labels)}
90 |         else:
91 |             raise KeyError(task_name)
92 | 


--------------------------------------------------------------------------------
/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The HuggingFace Inc. team.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert OpenAI GPT checkpoint."""
16 | 
17 | from __future__ import absolute_import, division, print_function
18 | 
19 | import argparse
20 | import json
21 | from io import open
22 | 
23 | import torch
24 | import numpy
25 | 
26 | from transformers import CONFIG_NAME, WEIGHTS_NAME
27 | from transformers.tokenization_xlm import VOCAB_FILES_NAMES
28 | 
29 | import logging
30 | logging.basicConfig(level=logging.INFO)
31 | 
32 | def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path):
33 |     # Load checkpoint
34 |     chkpt = torch.load(xlm_checkpoint_path, map_location='cpu')
35 | 
36 |     state_dict = chkpt['model']
37 | 
38 |     # We have the base model one level deeper than the original XLM repository
39 |     two_levels_state_dict = {}
40 |     for k, v in state_dict.items():
41 |         if 'pred_layer' in k:
42 |             two_levels_state_dict[k] = v
43 |         else:
44 |             two_levels_state_dict['transformer.' + k] = v
45 | 
46 |     config = chkpt['params']
47 |     config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray)))
48 | 
49 |     vocab = chkpt['dico_word2id']
50 |     vocab = dict((s + '</w>' if s.find('@@') == -1 and i > 13 else s.replace('@@', ''), i) for s, i in vocab.items())
51 | 
52 |     # Save pytorch-model
53 |     pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
54 |     pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
55 |     pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' +  VOCAB_FILES_NAMES['vocab_file']
56 | 
57 |     print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
58 |     torch.save(two_levels_state_dict, pytorch_weights_dump_path)
59 | 
60 |     print("Save configuration file to {}".format(pytorch_config_dump_path))
61 |     with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
62 |         f.write(json.dumps(config, indent=2) + "\n")
63 | 
64 |     print("Save vocab file to {}".format(pytorch_config_dump_path))
65 |     with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f:
66 |         f.write(json.dumps(vocab, indent=2) + "\n")
67 | 
68 | 
69 | if __name__ == "__main__":
70 |     parser = argparse.ArgumentParser()
71 |     ## Required parameters
72 |     parser.add_argument("--xlm_checkpoint_path",
73 |                         default = None,
74 |                         type = str,
75 |                         required = True,
76 |                         help = "Path the official PyTorch dump.")
77 |     parser.add_argument("--pytorch_dump_folder_path",
78 |                         default = None,
79 |                         type = str,
80 |                         required = True,
81 |                         help = "Path to the output PyTorch model.")
82 |     args = parser.parse_args()
83 |     convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path)
84 | 


--------------------------------------------------------------------------------
/transformers/configuration_distilbert.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """ DistilBERT model configuration """
16 | from __future__ import (absolute_import, division, print_function,
17 |                         unicode_literals)
18 | 
19 | import sys
20 | import json
21 | import logging
22 | from io import open
23 | 
24 | from .configuration_utils import PretrainedConfig
25 | 
26 | logger = logging.getLogger(__name__)
27 | 
28 | DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
29 |     'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
30 |     'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json",
31 |     'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json",
32 |     'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json",
33 | }
34 | 
35 | 
36 | class DistilBertConfig(PretrainedConfig):
37 |     pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
38 | 
39 |     def __init__(self,
40 |                  vocab_size_or_config_json_file=30522,
41 |                  max_position_embeddings=512,
42 |                  sinusoidal_pos_embds=False,
43 |                  n_layers=6,
44 |                  n_heads=12,
45 |                  dim=768,
46 |                  hidden_dim=4*768,
47 |                  dropout=0.1,
48 |                  attention_dropout=0.1,
49 |                  activation='gelu',
50 |                  initializer_range=0.02,
51 |                  tie_weights_=True,
52 |                  qa_dropout=0.1,
53 |                  seq_classif_dropout=0.2,
54 |                  **kwargs):
55 |         super(DistilBertConfig, self).__init__(**kwargs)
56 | 
57 |         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
58 |                         and isinstance(vocab_size_or_config_json_file, unicode)):
59 |             with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
60 |                 json_config = json.loads(reader.read())
61 |             for key, value in json_config.items():
62 |                 self.__dict__[key] = value
63 |         elif isinstance(vocab_size_or_config_json_file, int):
64 |             self.vocab_size = vocab_size_or_config_json_file
65 |             self.max_position_embeddings = max_position_embeddings
66 |             self.sinusoidal_pos_embds = sinusoidal_pos_embds
67 |             self.n_layers = n_layers
68 |             self.n_heads = n_heads
69 |             self.dim = dim
70 |             self.hidden_dim = hidden_dim
71 |             self.dropout = dropout
72 |             self.attention_dropout = attention_dropout
73 |             self.activation = activation
74 |             self.initializer_range = initializer_range
75 |             self.tie_weights_ = tie_weights_
76 |             self.qa_dropout = qa_dropout
77 |             self.seq_classif_dropout = seq_classif_dropout
78 |         else:
79 |             raise ValueError("First argument must be either a vocabulary size (int)"
80 |                              " or the path to a pretrained model config file (str)")
81 |     @property
82 |     def hidden_size(self):
83 |         return self.dim
84 | 
85 |     @property
86 |     def num_attention_heads(self):
87 |         return self.n_heads
88 | 
89 |     @property
90 |     def num_hidden_layers(self):
91 |         return self.n_layers
92 | 


--------------------------------------------------------------------------------
/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The HuggingFace Inc. team.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """Convert BERT checkpoint."""
 16 | 
 17 | from __future__ import absolute_import
 18 | from __future__ import division
 19 | from __future__ import print_function
 20 | 
 21 | import os
 22 | import argparse
 23 | import torch
 24 | 
 25 | from transformers import (CONFIG_NAME, WEIGHTS_NAME,
 26 |                                                     XLNetConfig,
 27 |                                                     XLNetLMHeadModel, XLNetForQuestionAnswering,
 28 |                                                     XLNetForSequenceClassification,
 29 |                                                     load_tf_weights_in_xlnet)
 30 | 
 31 | GLUE_TASKS_NUM_LABELS = {
 32 |     "cola": 2,
 33 |     "mnli": 3,
 34 |     "mrpc": 2,
 35 |     "sst-2": 2,
 36 |     "sts-b": 1,
 37 |     "qqp": 2,
 38 |     "qnli": 2,
 39 |     "rte": 2,
 40 |     "wnli": 2,
 41 | }
 42 | 
 43 | import logging
 44 | logging.basicConfig(level=logging.INFO)
 45 | 
 46 | def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None):
 47 |     # Initialise PyTorch model
 48 |     config = XLNetConfig.from_json_file(bert_config_file)
 49 | 
 50 |     finetuning_task = finetuning_task.lower() if finetuning_task is not None else ""
 51 |     if finetuning_task in GLUE_TASKS_NUM_LABELS:
 52 |         print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config)))
 53 |         config.finetuning_task = finetuning_task
 54 |         config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task]
 55 |         model = XLNetForSequenceClassification(config)
 56 |     elif 'squad' in finetuning_task:
 57 |         config.finetuning_task = finetuning_task
 58 |         model = XLNetForQuestionAnswering(config)
 59 |     else:
 60 |         model = XLNetLMHeadModel(config)
 61 | 
 62 |     # Load weights from tf checkpoint
 63 |     load_tf_weights_in_xlnet(model, config, tf_checkpoint_path)
 64 | 
 65 |     # Save pytorch-model
 66 |     pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
 67 |     pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
 68 |     print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path)))
 69 |     torch.save(model.state_dict(), pytorch_weights_dump_path)
 70 |     print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path)))
 71 |     with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
 72 |         f.write(config.to_json_string())
 73 | 
 74 | 
 75 | if __name__ == "__main__":
 76 |     parser = argparse.ArgumentParser()
 77 |     ## Required parameters
 78 |     parser.add_argument("--tf_checkpoint_path",
 79 |                         default = None,
 80 |                         type = str,
 81 |                         required = True,
 82 |                         help = "Path to the TensorFlow checkpoint path.")
 83 |     parser.add_argument("--xlnet_config_file",
 84 |                         default = None,
 85 |                         type = str,
 86 |                         required = True,
 87 |                         help = "The config json file corresponding to the pre-trained XLNet model. \n"
 88 |                                "This specifies the model architecture.")
 89 |     parser.add_argument("--pytorch_dump_folder_path",
 90 |                         default = None,
 91 |                         type = str,
 92 |                         required = True,
 93 |                         help = "Path to the folder to store the PyTorch model or dataset/vocab.")
 94 |     parser.add_argument("--finetuning_task",
 95 |                         default = None,
 96 |                         type = str,
 97 |                         help = "Name of a task on which the XLNet TensorFloaw model was fine-tuned")
 98 |     args = parser.parse_args()
 99 |     print(args)
100 | 
101 |     convert_xlnet_checkpoint_to_pytorch(args.tf_checkpoint_path,
102 |                                         args.xlnet_config_file,
103 |                                         args.pytorch_dump_folder_path,
104 |                                         args.finetuning_task)
105 | 


--------------------------------------------------------------------------------
/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The HuggingFace Inc. team.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | """Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint."""
 17 | 
 18 | import os
 19 | import argparse
 20 | import torch
 21 | import numpy as np
 22 | import tensorflow as tf
 23 | from transformers import BertModel
 24 | 
 25 | 
 26 | def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str):
 27 | 
 28 |     """
 29 |     :param model:BertModel Pytorch model instance to be converted
 30 |     :param ckpt_dir: Tensorflow model directory
 31 |     :param model_name: model name
 32 |     :return:
 33 | 
 34 |     Currently supported HF models:
 35 |         Y BertModel
 36 |         N BertForMaskedLM
 37 |         N BertForPreTraining
 38 |         N BertForMultipleChoice
 39 |         N BertForNextSentencePrediction
 40 |         N BertForSequenceClassification
 41 |         N BertForQuestionAnswering
 42 |     """
 43 | 
 44 |     tensors_to_transpose = (
 45 |         "dense.weight",
 46 |         "attention.self.query",
 47 |         "attention.self.key",
 48 |         "attention.self.value"
 49 |     )
 50 | 
 51 |     var_map = (
 52 |         ('layer.', 'layer_'),
 53 |         ('word_embeddings.weight', 'word_embeddings'),
 54 |         ('position_embeddings.weight', 'position_embeddings'),
 55 |         ('token_type_embeddings.weight', 'token_type_embeddings'),
 56 |         ('.', '/'),
 57 |         ('LayerNorm/weight', 'LayerNorm/gamma'),
 58 |         ('LayerNorm/bias', 'LayerNorm/beta'),
 59 |         ('weight', 'kernel')
 60 |     )
 61 | 
 62 |     if not os.path.isdir(ckpt_dir):
 63 |         os.makedirs(ckpt_dir)
 64 | 
 65 |     state_dict = model.state_dict()
 66 | 
 67 |     def to_tf_var_name(name:str):
 68 |         for patt, repl in iter(var_map):
 69 |             name = name.replace(patt, repl)
 70 |         return 'bert/{}'.format(name)
 71 | 
 72 |     def create_tf_var(tensor:np.ndarray, name:str, session:tf.Session):
 73 |         tf_dtype = tf.dtypes.as_dtype(tensor.dtype)
 74 |         tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer())
 75 |         session.run(tf.variables_initializer([tf_var]))
 76 |         session.run(tf_var)
 77 |         return tf_var
 78 | 
 79 |     tf.reset_default_graph()
 80 |     with tf.Session() as session:
 81 |         for var_name in state_dict:
 82 |             tf_name = to_tf_var_name(var_name)
 83 |             torch_tensor = state_dict[var_name].numpy()
 84 |             if any([x in var_name for x in tensors_to_transpose]):
 85 |                 torch_tensor = torch_tensor.T
 86 |             tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session)
 87 |             tf.keras.backend.set_value(tf_var, torch_tensor)
 88 |             tf_weight = session.run(tf_var)
 89 |             print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor)))
 90 | 
 91 |         saver = tf.train.Saver(tf.trainable_variables())
 92 |         saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt"))
 93 | 
 94 | 
 95 | def main(raw_args=None):
 96 |     parser = argparse.ArgumentParser()
 97 |     parser.add_argument("--model_name",
 98 |                         type=str,
 99 |                         required=True,
100 |                         help="model name e.g. bert-base-uncased")
101 |     parser.add_argument("--cache_dir",
102 |                         type=str,
103 |                         default=None,
104 |                         required=False,
105 |                         help="Directory containing pytorch model")
106 |     parser.add_argument("--pytorch_model_path",
107 |                         type=str,
108 |                         required=True,
109 |                         help="/path/to/<pytorch-model-name>.bin")
110 |     parser.add_argument("--tf_cache_dir",
111 |                         type=str,
112 |                         required=True,
113 |                         help="Directory in which to save tensorflow model")
114 |     args = parser.parse_args(raw_args)
115 |     
116 |     model = BertModel.from_pretrained(
117 |         pretrained_model_name_or_path=args.model_name,
118 |         state_dict=torch.load(args.pytorch_model_path),
119 |         cache_dir=args.cache_dir
120 |     )
121 |     
122 |     convert_pytorch_checkpoint_to_tf(
123 |         model=model,
124 |         ckpt_dir=args.tf_cache_dir,
125 |         model_name=args.model_name
126 |     )
127 | 
128 | 
129 | if __name__ == "__main__":
130 |     main()
131 | 


--------------------------------------------------------------------------------
/transformers/data/processors/utils.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
  3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | 
 17 | import csv
 18 | import sys
 19 | import copy
 20 | import json
 21 | 
 22 | class InputExample(object):
 23 |     """
 24 |     A single training/test example for simple sequence classification.
 25 | 
 26 |     Args:
 27 |         guid: Unique id for the example.
 28 |         text_a: string. The untokenized text of the first sequence. For single
 29 |         sequence tasks, only this sequence must be specified.
 30 |         text_b: (Optional) string. The untokenized text of the second sequence.
 31 |         Only must be specified for sequence pair tasks.
 32 |         label: (Optional) string. The label of the example. This should be
 33 |         specified for train and dev examples, but not for test examples.
 34 |     """
 35 |     def __init__(self, guid, text_a, text_b=None, label=None):
 36 |         self.guid = guid
 37 |         self.text_a = text_a
 38 |         self.text_b = text_b
 39 |         self.label = label
 40 | 
 41 |     def __repr__(self):
 42 |         return str(self.to_json_string())
 43 | 
 44 |     def to_dict(self):
 45 |         """Serializes this instance to a Python dictionary."""
 46 |         output = copy.deepcopy(self.__dict__)
 47 |         return output
 48 | 
 49 |     def to_json_string(self):
 50 |         """Serializes this instance to a JSON string."""
 51 |         return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
 52 | 
 53 | 
 54 | class InputFeatures(object):
 55 |     """
 56 |     A single set of features of data.
 57 | 
 58 |     Args:
 59 |         input_ids: Indices of input sequence tokens in the vocabulary.
 60 |         attention_mask: Mask to avoid performing attention on padding token indices.
 61 |             Mask values selected in ``[0, 1]``:
 62 |             Usually  ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
 63 |         token_type_ids: Segment token indices to indicate first and second portions of the inputs.
 64 |         label: Label corresponding to the input
 65 |     """
 66 | 
 67 |     def __init__(self, input_ids, attention_mask, token_type_ids, label):
 68 |         self.input_ids = input_ids
 69 |         self.attention_mask = attention_mask
 70 |         self.token_type_ids = token_type_ids
 71 |         self.label = label
 72 | 
 73 |     def __repr__(self):
 74 |         return str(self.to_json_string())
 75 | 
 76 |     def to_dict(self):
 77 |         """Serializes this instance to a Python dictionary."""
 78 |         output = copy.deepcopy(self.__dict__)
 79 |         return output
 80 | 
 81 |     def to_json_string(self):
 82 |         """Serializes this instance to a JSON string."""
 83 |         return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
 84 | 
 85 | 
 86 | class DataProcessor(object):
 87 |     """Base class for data converters for sequence classification data sets."""
 88 | 
 89 |     def get_example_from_tensor_dict(self, tensor_dict):
 90 |         """Gets an example from a dict with tensorflow tensors
 91 | 
 92 |         Args:
 93 |             tensor_dict: Keys and values should match the corresponding Glue
 94 |                 tensorflow_dataset examples.
 95 |         """
 96 |         raise NotImplementedError()
 97 | 
 98 |     def get_train_examples(self, data_dir):
 99 |         """Gets a collection of `InputExample`s for the train set."""
100 |         raise NotImplementedError()
101 | 
102 |     def get_dev_examples(self, data_dir):
103 |         """Gets a collection of `InputExample`s for the dev set."""
104 |         raise NotImplementedError()
105 | 
106 |     def get_labels(self):
107 |         """Gets the list of labels for this data set."""
108 |         raise NotImplementedError()
109 | 
110 |     def tfds_map(self, example):
111 |         """Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. 
112 |         This method converts examples to the correct format."""
113 |         if len(self.get_labels()) > 1:
114 |             example.label = self.get_labels()[int(example.label)]
115 |         return example
116 | 
117 |     @classmethod
118 |     def _read_tsv(cls, input_file, quotechar=None):
119 |         """Reads a tab separated value file."""
120 |         with open(input_file, "r", encoding="utf-8-sig") as f:
121 |             reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
122 |             lines = []
123 |             for line in reader:
124 |                 if sys.version_info[0] == 2:
125 |                     line = list(unicode(cell, 'utf-8') for cell in line)
126 |                 lines.append(line)
127 |             return lines
128 | 


--------------------------------------------------------------------------------
/transformers/configuration_albert.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
  3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | """ ALBERT model configuration """
 17 | 
 18 | from .configuration_utils import PretrainedConfig
 19 | 
 20 | ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 21 |     'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-config.json",
 22 |     'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-config.json",
 23 |     'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-config.json",
 24 |     'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-config.json",
 25 |     'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-config.json",
 26 |     'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-config.json",
 27 |     'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-config.json",
 28 |     'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-config.json",
 29 | }
 30 | 
 31 | class AlbertConfig(PretrainedConfig):
 32 |     """Configuration for `AlbertModel`.
 33 | 
 34 |     The default settings match the configuration of model `albert_xxlarge`.
 35 |     """
 36 | 
 37 |     pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 38 | 
 39 |     def __init__(self,
 40 |                  vocab_size_or_config_json_file=30000,
 41 |                  embedding_size=128,
 42 |                  hidden_size=4096,
 43 |                  num_hidden_layers=12,
 44 |                  num_hidden_groups=1,
 45 |                  num_attention_heads=64,
 46 |                  intermediate_size=16384,
 47 |                  inner_group_num=1,
 48 |                  hidden_act="gelu_new",
 49 |                  hidden_dropout_prob=0,
 50 |                  attention_probs_dropout_prob=0,
 51 |                  max_position_embeddings=512,
 52 |                  type_vocab_size=2,
 53 |                  initializer_range=0.02,
 54 |                  layer_norm_eps=1e-12, **kwargs):
 55 |         """Constructs AlbertConfig.
 56 | 
 57 |         Args:
 58 |             vocab_size: Vocabulary size of `inputs_ids` in `AlbertModel`.
 59 |             embedding_size: size of voc embeddings.
 60 |             hidden_size: Size of the encoder layers and the pooler layer.
 61 |             num_hidden_layers: Number of hidden layers in the Transformer encoder.
 62 |             num_hidden_groups: Number of group for the hidden layers, parameters in
 63 |                 the same group are shared.
 64 |             num_attention_heads: Number of attention heads for each attention layer in
 65 |                 the Transformer encoder.
 66 |             intermediate_size: The size of the "intermediate" (i.e., feed-forward)
 67 |                 layer in the Transformer encoder.
 68 |             inner_group_num: int, number of inner repetition of attention and ffn.
 69 |             down_scale_factor: float, the scale to apply
 70 |             hidden_act: The non-linear activation function (function or string) in the
 71 |                 encoder and pooler.
 72 |             hidden_dropout_prob: The dropout probability for all fully connected
 73 |                 layers in the embeddings, encoder, and pooler.
 74 |             attention_probs_dropout_prob: The dropout ratio for the attention
 75 |                 probabilities.
 76 |             max_position_embeddings: The maximum sequence length that this model might
 77 |                 ever be used with. Typically set this to something large just in case
 78 |                 (e.g., 512 or 1024 or 2048).
 79 |             type_vocab_size: The vocabulary size of the `token_type_ids` passed into
 80 |                 `AlbertModel`.
 81 |             initializer_range: The stdev of the truncated_normal_initializer for
 82 |                 initializing all weight matrices.
 83 |         """
 84 |         super(AlbertConfig, self).__init__(**kwargs)
 85 | 
 86 |         self.vocab_size = vocab_size_or_config_json_file
 87 |         self.embedding_size = embedding_size
 88 |         self.hidden_size = hidden_size
 89 |         self.num_hidden_layers = num_hidden_layers
 90 |         self.num_hidden_groups = num_hidden_groups
 91 |         self.num_attention_heads = num_attention_heads
 92 |         self.inner_group_num = inner_group_num
 93 |         self.hidden_act = hidden_act
 94 |         self.intermediate_size = intermediate_size
 95 |         self.hidden_dropout_prob = hidden_dropout_prob
 96 |         self.attention_probs_dropout_prob = attention_probs_dropout_prob
 97 |         self.max_position_embeddings = max_position_embeddings
 98 |         self.type_vocab_size = type_vocab_size
 99 |         self.initializer_range = initializer_range
100 |         self.layer_norm_eps = layer_norm_eps


--------------------------------------------------------------------------------
/transformers/configuration_openai.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
  3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | """ OpenAI GPT configuration """
 17 | 
 18 | from __future__ import absolute_import, division, print_function, unicode_literals
 19 | 
 20 | import json
 21 | import logging
 22 | import sys
 23 | from io import open
 24 | 
 25 | from .configuration_utils import PretrainedConfig
 26 | 
 27 | logger = logging.getLogger(__name__)
 28 | 
 29 | OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 30 |     "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"
 31 | }
 32 | 
 33 | class OpenAIGPTConfig(PretrainedConfig):
 34 |     """
 35 |     Configuration class to store the configuration of a `OpenAIGPTModel`.
 36 | 
 37 |     Args:
 38 |         vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
 39 |         n_positions: Number of positional embeddings.
 40 |         n_ctx: Size of the causal mask (usually same as n_positions).
 41 |         n_embd: Dimensionality of the embeddings and hidden states.
 42 |         n_layer: Number of hidden layers in the Transformer encoder.
 43 |         n_head: Number of attention heads for each attention layer in
 44 |             the Transformer encoder.
 45 |         afn: The non-linear activation function (function or string) in the
 46 |             encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
 47 |         resid_pdrop: The dropout probabilitiy for all fully connected
 48 |             layers in the embeddings, encoder, and pooler.
 49 |         attn_pdrop: The dropout ratio for the attention
 50 |             probabilities.
 51 |         embd_pdrop: The dropout ratio for the embeddings.
 52 |         layer_norm_epsilon: epsilon to use in the layer norm layers
 53 |         initializer_range: The sttdev of the truncated_normal_initializer for
 54 |             initializing all weight matrices.
 55 |         predict_special_tokens: should we predict special tokens (when the model has a LM head)
 56 |     """
 57 |     pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
 58 | 
 59 |     def __init__(
 60 |         self,
 61 |         vocab_size_or_config_json_file=40478,
 62 |         n_positions=512,
 63 |         n_ctx=512,
 64 |         n_embd=768,
 65 |         n_layer=12,
 66 |         n_head=12,
 67 |         afn="gelu",
 68 |         resid_pdrop=0.1,
 69 |         embd_pdrop=0.1,
 70 |         attn_pdrop=0.1,
 71 |         layer_norm_epsilon=1e-5,
 72 |         initializer_range=0.02,
 73 |         predict_special_tokens=True,
 74 | 
 75 |         num_labels=1,
 76 |         summary_type='cls_index',
 77 |         summary_use_proj=True,
 78 |         summary_activation=None,
 79 |         summary_proj_to_labels=True,
 80 |         summary_first_dropout=0.1,
 81 |         **kwargs
 82 |     ):
 83 |         """Constructs OpenAIGPTConfig.
 84 |         """
 85 |         super(OpenAIGPTConfig, self).__init__(**kwargs)
 86 | 
 87 |         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
 88 |                         and isinstance(vocab_size_or_config_json_file, unicode)):
 89 |             with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
 90 |                 json_config = json.loads(reader.read())
 91 |             for key, value in json_config.items():
 92 |                 self.__dict__[key] = value
 93 |         elif isinstance(vocab_size_or_config_json_file, int):
 94 |             self.vocab_size = vocab_size_or_config_json_file
 95 |             self.n_ctx = n_ctx
 96 |             self.n_positions = n_positions
 97 |             self.n_embd = n_embd
 98 |             self.n_layer = n_layer
 99 |             self.n_head = n_head
100 |             self.afn = afn
101 |             self.resid_pdrop = resid_pdrop
102 |             self.embd_pdrop = embd_pdrop
103 |             self.attn_pdrop = attn_pdrop
104 |             self.layer_norm_epsilon = layer_norm_epsilon
105 |             self.initializer_range = initializer_range
106 |             self.predict_special_tokens = predict_special_tokens
107 | 
108 |             self.num_labels = num_labels
109 |             self.summary_type = summary_type
110 |             self.summary_use_proj = summary_use_proj
111 |             self.summary_activation = summary_activation
112 |             self.summary_first_dropout = summary_first_dropout
113 |             self.summary_proj_to_labels = summary_proj_to_labels
114 |         else:
115 |             raise ValueError(
116 |                 "First argument must be either a vocabulary size (int)"
117 |                 "or the path to a pretrained model config file (str)"
118 |             )
119 | 
120 |     @property
121 |     def max_position_embeddings(self):
122 |         return self.n_positions
123 | 
124 |     @property
125 |     def hidden_size(self):
126 |         return self.n_embd
127 | 
128 |     @property
129 |     def num_attention_heads(self):
130 |         return self.n_head
131 | 
132 |     @property
133 |     def num_hidden_layers(self):
134 |         return self.n_layer
135 | 


--------------------------------------------------------------------------------
/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The HuggingFace Inc. team.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """Convert Transformer XL checkpoint and datasets."""
 16 | 
 17 | from __future__ import absolute_import, division, print_function
 18 | 
 19 | import argparse
 20 | import os
 21 | import sys
 22 | from io import open
 23 | 
 24 | import torch
 25 | 
 26 | import transformers.tokenization_transfo_xl as data_utils
 27 | 
 28 | from transformers import CONFIG_NAME, WEIGHTS_NAME
 29 | from transformers import (TransfoXLConfig, TransfoXLLMHeadModel,
 30 |                                                       load_tf_weights_in_transfo_xl)
 31 | from transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
 32 | 
 33 | if sys.version_info[0] == 2:
 34 |     import cPickle as pickle
 35 | else:
 36 |     import pickle
 37 | 
 38 | import logging
 39 | logging.basicConfig(level=logging.INFO)
 40 | 
 41 | # We do this to be able to load python 2 datasets pickles
 42 | # See e.g. https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918
 43 | data_utils.Vocab = data_utils.TransfoXLTokenizer
 44 | data_utils.Corpus = data_utils.TransfoXLCorpus
 45 | sys.modules['data_utils'] = data_utils
 46 | sys.modules['vocabulary'] = data_utils
 47 | 
 48 | def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
 49 |                                              transfo_xl_config_file,
 50 |                                              pytorch_dump_folder_path,
 51 |                                              transfo_xl_dataset_file):
 52 |     if transfo_xl_dataset_file:
 53 |         # Convert a pre-processed corpus (see original TensorFlow repo)
 54 |         with open(transfo_xl_dataset_file, "rb") as fp:
 55 |             corpus = pickle.load(fp, encoding="latin1")
 56 |         # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term)
 57 |         pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['pretrained_vocab_file']
 58 |         print("Save vocabulary to {}".format(pytorch_vocab_dump_path))
 59 |         corpus_vocab_dict = corpus.vocab.__dict__
 60 |         torch.save(corpus_vocab_dict, pytorch_vocab_dump_path)
 61 | 
 62 |         corpus_dict_no_vocab = corpus.__dict__
 63 |         corpus_dict_no_vocab.pop('vocab', None)
 64 |         pytorch_dataset_dump_path = pytorch_dump_folder_path + '/' + CORPUS_NAME
 65 |         print("Save dataset to {}".format(pytorch_dataset_dump_path))
 66 |         torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path)
 67 | 
 68 |     if tf_checkpoint_path:
 69 |         # Convert a pre-trained TensorFlow model
 70 |         config_path = os.path.abspath(transfo_xl_config_file)
 71 |         tf_path = os.path.abspath(tf_checkpoint_path)
 72 | 
 73 |         print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path))
 74 |         # Initialise PyTorch model
 75 |         if transfo_xl_config_file == "":
 76 |             config = TransfoXLConfig()
 77 |         else:
 78 |             config = TransfoXLConfig.from_json_file(transfo_xl_config_file)
 79 |         print("Building PyTorch model from configuration: {}".format(str(config)))
 80 |         model = TransfoXLLMHeadModel(config)
 81 | 
 82 |         model = load_tf_weights_in_transfo_xl(model, config, tf_path)
 83 |         # Save pytorch-model
 84 |         pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
 85 |         pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
 86 |         print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path)))
 87 |         torch.save(model.state_dict(), pytorch_weights_dump_path)
 88 |         print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path)))
 89 |         with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
 90 |             f.write(config.to_json_string())
 91 | 
 92 | 
 93 | if __name__ == "__main__":
 94 |     parser = argparse.ArgumentParser()
 95 |     parser.add_argument("--pytorch_dump_folder_path",
 96 |                         default = None,
 97 |                         type = str,
 98 |                         required = True,
 99 |                         help = "Path to the folder to store the PyTorch model or dataset/vocab.")
100 |     parser.add_argument("--tf_checkpoint_path",
101 |                         default = "",
102 |                         type = str,
103 |                         help = "An optional path to a TensorFlow checkpoint path to be converted.")
104 |     parser.add_argument("--transfo_xl_config_file",
105 |                         default = "",
106 |                         type = str,
107 |                         help = "An optional config json file corresponding to the pre-trained BERT model. \n"
108 |                             "This specifies the model architecture.")
109 |     parser.add_argument("--transfo_xl_dataset_file",
110 |                         default = "",
111 |                         type = str,
112 |                         help = "An optional dataset file to be converted in a vocabulary.")
113 |     args = parser.parse_args()
114 |     convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path,
115 |                                      args.transfo_xl_config_file,
116 |                                      args.pytorch_dump_folder_path,
117 |                                      args.transfo_xl_dataset_file)
118 | 


--------------------------------------------------------------------------------
/transformers/commands/user.py:
--------------------------------------------------------------------------------
  1 | from argparse import ArgumentParser
  2 | from getpass import getpass
  3 | import os
  4 | 
  5 | from transformers.commands import BaseTransformersCLICommand
  6 | from transformers.hf_api import HfApi, HfFolder, HTTPError
  7 | 
  8 | 
  9 | class UserCommands(BaseTransformersCLICommand):
 10 |     @staticmethod
 11 |     def register_subcommand(parser: ArgumentParser):
 12 |         login_parser = parser.add_parser('login')
 13 |         login_parser.set_defaults(func=lambda args: LoginCommand(args))
 14 |         whoami_parser = parser.add_parser('whoami')
 15 |         whoami_parser.set_defaults(func=lambda args: WhoamiCommand(args))
 16 |         logout_parser = parser.add_parser('logout')
 17 |         logout_parser.set_defaults(func=lambda args: LogoutCommand(args))
 18 |         list_parser = parser.add_parser('ls')
 19 |         list_parser.set_defaults(func=lambda args: ListObjsCommand(args))
 20 |         # upload
 21 |         upload_parser = parser.add_parser('upload')
 22 |         upload_parser.add_argument('file', type=str, help='Local filepath of the file to upload.')
 23 |         upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override object filename on S3.')
 24 |         upload_parser.set_defaults(func=lambda args: UploadCommand(args))
 25 | 
 26 | 
 27 | 
 28 | class ANSI:
 29 |     """
 30 |     Helper for en.wikipedia.org/wiki/ANSI_escape_code
 31 |     """
 32 |     _bold = u"\u001b[1m"
 33 |     _reset = u"\u001b[0m"
 34 |     @classmethod
 35 |     def bold(cls, s):
 36 |         return "{}{}{}".format(cls._bold, s, cls._reset)
 37 | 
 38 | 
 39 | class BaseUserCommand:
 40 |     def __init__(self, args):
 41 |         self.args = args
 42 |         self._api = HfApi()
 43 | 
 44 | 
 45 | class LoginCommand(BaseUserCommand):
 46 |     def run(self):
 47 |         print("""
 48 |         _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|  
 49 |         _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|        
 50 |         _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|    
 51 |         _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|        
 52 |         _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|  
 53 | 
 54 |         """)
 55 |         username = input("Username: ")
 56 |         password = getpass()
 57 |         try:
 58 |             token = self._api.login(username, password)
 59 |         except HTTPError as e:
 60 |             # probably invalid credentials, display error message.
 61 |             print(e)
 62 |             exit(1)
 63 |         HfFolder.save_token(token)
 64 |         print("Login successful")
 65 |         print("Your token:", token, "\n")
 66 |         print("Your token has been saved to", HfFolder.path_token)
 67 | 
 68 | 
 69 | class WhoamiCommand(BaseUserCommand):
 70 |     def run(self):
 71 |         token = HfFolder.get_token()
 72 |         if token is None:
 73 |             print("Not logged in")
 74 |             exit()
 75 |         try:
 76 |             user = self._api.whoami(token)
 77 |             print(user)
 78 |         except HTTPError as e:
 79 |             print(e)
 80 | 
 81 | 
 82 | class LogoutCommand(BaseUserCommand):
 83 |     def run(self):
 84 |         token = HfFolder.get_token()
 85 |         if token is None:
 86 |             print("Not logged in")
 87 |             exit()
 88 |         HfFolder.delete_token()
 89 |         self._api.logout(token)
 90 |         print("Successfully logged out.")
 91 | 
 92 | 
 93 | class ListObjsCommand(BaseUserCommand):
 94 |     def tabulate(self, rows, headers):
 95 |         # type: (List[List[Union[str, int]]], List[str]) -> str
 96 |         """
 97 |         Inspired by:
 98 |         stackoverflow.com/a/8356620/593036
 99 |         stackoverflow.com/questions/9535954/printing-lists-as-tabular-data
100 |         """
101 |         col_widths = [max(len(str(x)) for x in col) for col in zip(*rows, headers)]
102 |         row_format = ("{{:{}}} " * len(headers)).format(*col_widths)
103 |         lines = []
104 |         lines.append(
105 |             row_format.format(*headers)
106 |         )
107 |         lines.append(
108 |             row_format.format(*["-" * w for w in col_widths])
109 |         )
110 |         for row in rows:
111 |             lines.append(
112 |                 row_format.format(*row)
113 |             )
114 |         return "\n".join(lines)
115 | 
116 |     def run(self):
117 |         token = HfFolder.get_token()
118 |         if token is None:
119 |             print("Not logged in")
120 |             exit(1)
121 |         try:
122 |             objs = self._api.list_objs(token)
123 |         except HTTPError as e:
124 |             print(e)
125 |             exit(1)
126 |         if len(objs) == 0:
127 |             print("No shared file yet")
128 |             exit()
129 |         rows = [ [
130 |             obj.filename,
131 |             obj.LastModified,
132 |             obj.ETag,
133 |             obj.Size
134 |         ] for obj in objs ]
135 |         print(
136 |             self.tabulate(rows, headers=["Filename", "LastModified", "ETag", "Size"])
137 |         )
138 | 
139 | 
140 | class UploadCommand(BaseUserCommand):
141 |     def run(self):
142 |         token = HfFolder.get_token()
143 |         if token is None:
144 |             print("Not logged in")
145 |             exit(1)
146 |         filepath = os.path.join(os.getcwd(), self.args.file)
147 |         filename = self.args.filename if self.args.filename is not None else os.path.basename(filepath)
148 |         print(
149 |             "About to upload file {} to S3 under filename {}".format(
150 |                 ANSI.bold(filepath), ANSI.bold(filename)
151 |             )
152 |         )
153 | 
154 |         choice = input("Proceed? [Y/n] ").lower()
155 |         if not(choice == "" or choice == "y" or choice == "yes"):
156 |             print("Abort")
157 |             exit()
158 |         print(
159 |             ANSI.bold("Uploading... This might take a while if file is large")
160 |         )
161 |         access_url = self._api.presign_and_upload(
162 |             token=token, filename=filename, filepath=filepath
163 |         )
164 |         print("Your file now lives at:")
165 |         print(access_url)
166 | 


--------------------------------------------------------------------------------
/transformers/configuration_ctrl.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 Salesforce and HuggingFace Inc. team.
  3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """ Salesforce CTRL configuration """
 16 | 
 17 | from __future__ import absolute_import, division, print_function, unicode_literals
 18 | 
 19 | import json
 20 | import logging
 21 | import sys
 22 | from io import open
 23 | 
 24 | from .configuration_utils import PretrainedConfig
 25 | 
 26 | logger = logging.getLogger(__name__)
 27 | 
 28 | CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf-ctrl/pytorch/ctrl-config.json"}
 29 | 
 30 | class CTRLConfig(PretrainedConfig):
 31 |     """Configuration class to store the configuration of a `CTRLModel`.
 32 | 
 33 |     Args:
 34 |         vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
 35 |         n_positions: Number of positional embeddings.
 36 |         n_ctx: Size of the causal mask (usually same as n_positions).
 37 |         dff: Size of the inner dimension of the FFN.
 38 |         n_embd: Dimensionality of the embeddings and hidden states.
 39 |         n_layer: Number of hidden layers in the Transformer encoder.
 40 |         n_head: Number of attention heads for each attention layer in
 41 |             the Transformer encoder.
 42 |         layer_norm_epsilon: epsilon to use in the layer norm layers
 43 |         resid_pdrop: The dropout probabilitiy for all fully connected
 44 |             layers in the embeddings, encoder, and pooler.
 45 |         attn_pdrop: The dropout ratio for the attention
 46 |             probabilities.
 47 |         embd_pdrop: The dropout ratio for the embeddings.
 48 |         initializer_range: The sttdev of the truncated_normal_initializer for
 49 |             initializing all weight matrices.
 50 |     """
 51 |     pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
 52 | 
 53 |     def __init__(
 54 |         self,
 55 |         vocab_size_or_config_json_file=246534,
 56 |         n_positions=256,
 57 |         n_ctx=256,
 58 |         n_embd=1280,
 59 |         dff=8192,
 60 |         n_layer=48,
 61 |         n_head=16,
 62 |         resid_pdrop=0.1,
 63 |         embd_pdrop=0.1,
 64 |         attn_pdrop=0.1,
 65 |         layer_norm_epsilon=1e-6,
 66 |         initializer_range=0.02,
 67 | 
 68 |         num_labels=1,
 69 |         summary_type='cls_index',
 70 |         summary_use_proj=True,
 71 |         summary_activation=None,
 72 |         summary_proj_to_labels=True,
 73 |         summary_first_dropout=0.1,
 74 |         **kwargs
 75 |     ):
 76 |         """Constructs CTRLConfig.
 77 | 
 78 |         Args:
 79 |             vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
 80 |             n_positions: Number of positional embeddings.
 81 |             n_ctx: Size of the causal mask (usually same as n_positions).
 82 |             dff: Size of the inner dimension of the FFN.
 83 |             n_embd: Dimensionality of the embeddings and hidden states.
 84 |             n_layer: Number of hidden layers in the Transformer encoder.
 85 |             n_head: Number of attention heads for each attention layer in
 86 |                 the Transformer encoder.
 87 |             layer_norm_epsilon: epsilon to use in the layer norm layers
 88 |             resid_pdrop: The dropout probabilitiy for all fully connected
 89 |                 layers in the embeddings, encoder, and pooler.
 90 |             attn_pdrop: The dropout ratio for the attention
 91 |                 probabilities.
 92 |             embd_pdrop: The dropout ratio for the embeddings.
 93 |             initializer_range: The sttdev of the truncated_normal_initializer for
 94 |                 initializing all weight matrices.
 95 |         """
 96 |         super(CTRLConfig, self).__init__(**kwargs)
 97 | 
 98 |         self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
 99 |         self.n_ctx = n_ctx
100 |         self.n_positions = n_positions
101 |         self.n_embd = n_embd
102 |         self.n_layer = n_layer
103 |         self.n_head = n_head
104 |         self.dff = dff
105 |         self.resid_pdrop = resid_pdrop
106 |         self.embd_pdrop = embd_pdrop
107 |         self.attn_pdrop = attn_pdrop
108 |         self.layer_norm_epsilon = layer_norm_epsilon
109 |         self.initializer_range = initializer_range
110 | 
111 |         self.num_labels = num_labels
112 |         self.summary_type = summary_type
113 |         self.summary_use_proj = summary_use_proj
114 |         self.summary_activation = summary_activation
115 |         self.summary_first_dropout = summary_first_dropout
116 |         self.summary_proj_to_labels = summary_proj_to_labels
117 |         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
118 |                         and isinstance(vocab_size_or_config_json_file, unicode)):
119 |             with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
120 |                 json_config = json.loads(reader.read())
121 |             for key, value in json_config.items():
122 |                 self.__dict__[key] = value
123 |         elif not isinstance(vocab_size_or_config_json_file, int):
124 |             raise ValueError(
125 |                 "First argument must be either a vocabulary size (int)"
126 |                 "or the path to a pretrained model config file (str)"
127 |             )
128 | 
129 |     @property
130 |     def max_position_embeddings(self):
131 |         return self.n_positions
132 | 
133 |     @property
134 |     def hidden_size(self):
135 |         return self.n_embd
136 | 
137 |     @property
138 |     def num_attention_heads(self):
139 |         return self.n_head
140 | 
141 |     @property
142 |     def num_hidden_layers(self):
143 |         return self.n_layer
144 | 


--------------------------------------------------------------------------------
/transformers/configuration_gpt2.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
  3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | """ OpenAI GPT-2 configuration """
 17 | 
 18 | from __future__ import absolute_import, division, print_function, unicode_literals
 19 | 
 20 | import json
 21 | import logging
 22 | import sys
 23 | from io import open
 24 | 
 25 | from .configuration_utils import PretrainedConfig
 26 | 
 27 | logger = logging.getLogger(__name__)
 28 | 
 29 | GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
 30 |                                       "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
 31 |                                       "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json",
 32 |                                       "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-config.json",
 33 |                                       "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",}
 34 | 
 35 | class GPT2Config(PretrainedConfig):
 36 |     """Configuration class to store the configuration of a `GPT2Model`.
 37 | 
 38 |     Args:
 39 |         vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
 40 |         n_positions: Number of positional embeddings.
 41 |         n_ctx: Size of the causal mask (usually same as n_positions).
 42 |         n_embd: Dimensionality of the embeddings and hidden states.
 43 |         n_layer: Number of hidden layers in the Transformer encoder.
 44 |         n_head: Number of attention heads for each attention layer in
 45 |             the Transformer encoder.
 46 |         layer_norm_epsilon: epsilon to use in the layer norm layers
 47 |         resid_pdrop: The dropout probabilitiy for all fully connected
 48 |             layers in the embeddings, encoder, and pooler.
 49 |         attn_pdrop: The dropout ratio for the attention
 50 |             probabilities.
 51 |         embd_pdrop: The dropout ratio for the embeddings.
 52 |         initializer_range: The sttdev of the truncated_normal_initializer for
 53 |             initializing all weight matrices.
 54 |     """
 55 |     pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 56 | 
 57 |     def __init__(
 58 |         self,
 59 |         vocab_size_or_config_json_file=50257,
 60 |         n_positions=1024,
 61 |         n_ctx=1024,
 62 |         n_embd=768,
 63 |         n_layer=12,
 64 |         n_head=12,
 65 |         resid_pdrop=0.1,
 66 |         embd_pdrop=0.1,
 67 |         attn_pdrop=0.1,
 68 |         layer_norm_epsilon=1e-5,
 69 |         initializer_range=0.02,
 70 | 
 71 |         num_labels=1,
 72 |         summary_type='cls_index',
 73 |         summary_use_proj=True,
 74 |         summary_activation=None,
 75 |         summary_proj_to_labels=True,
 76 |         summary_first_dropout=0.1,
 77 |         **kwargs
 78 |     ):
 79 |         """Constructs GPT2Config.
 80 | 
 81 |         Args:
 82 |             vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
 83 |             n_positions: Number of positional embeddings.
 84 |             n_ctx: Size of the causal mask (usually same as n_positions).
 85 |             n_embd: Dimensionality of the embeddings and hidden states.
 86 |             n_layer: Number of hidden layers in the Transformer encoder.
 87 |             n_head: Number of attention heads for each attention layer in
 88 |                 the Transformer encoder.
 89 |             layer_norm_epsilon: epsilon to use in the layer norm layers
 90 |             resid_pdrop: The dropout probabilitiy for all fully connected
 91 |                 layers in the embeddings, encoder, and pooler.
 92 |             attn_pdrop: The dropout ratio for the attention
 93 |                 probabilities.
 94 |             embd_pdrop: The dropout ratio for the embeddings.
 95 |             initializer_range: The sttdev of the truncated_normal_initializer for
 96 |                 initializing all weight matrices.
 97 |         """
 98 |         super(GPT2Config, self).__init__(**kwargs)
 99 | 
100 |         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
101 |                         and isinstance(vocab_size_or_config_json_file, unicode)):
102 |             with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
103 |                 json_config = json.loads(reader.read())
104 |             for key, value in json_config.items():
105 |                 self.__dict__[key] = value
106 |         elif isinstance(vocab_size_or_config_json_file, int):
107 |             self.vocab_size = vocab_size_or_config_json_file
108 |             self.n_ctx = n_ctx
109 |             self.n_positions = n_positions
110 |             self.n_embd = n_embd
111 |             self.n_layer = n_layer
112 |             self.n_head = n_head
113 |             self.resid_pdrop = resid_pdrop
114 |             self.embd_pdrop = embd_pdrop
115 |             self.attn_pdrop = attn_pdrop
116 |             self.layer_norm_epsilon = layer_norm_epsilon
117 |             self.initializer_range = initializer_range
118 | 
119 |             self.num_labels = num_labels
120 |             self.summary_type = summary_type
121 |             self.summary_use_proj = summary_use_proj
122 |             self.summary_activation = summary_activation
123 |             self.summary_first_dropout = summary_first_dropout
124 |             self.summary_proj_to_labels = summary_proj_to_labels
125 |         else:
126 |             raise ValueError(
127 |                 "First argument must be either a vocabulary size (int)"
128 |                 "or the path to a pretrained model config file (str)"
129 |             )
130 | 
131 |     @property
132 |     def max_position_embeddings(self):
133 |         return self.n_positions
134 | 
135 |     @property
136 |     def hidden_size(self):
137 |         return self.n_embd
138 | 
139 |     @property
140 |     def num_attention_heads(self):
141 |         return self.n_head
142 | 
143 |     @property
144 |     def num_hidden_layers(self):
145 |         return self.n_layer
146 | 


--------------------------------------------------------------------------------
/transformers/configuration_xlnet.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
  3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | """ XLNet configuration """
 17 | from __future__ import absolute_import, division, print_function, unicode_literals
 18 | 
 19 | import json
 20 | import logging
 21 | import sys
 22 | from io import open
 23 | 
 24 | from .configuration_utils import PretrainedConfig
 25 | 
 26 | logger = logging.getLogger(__name__)
 27 | 
 28 | XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 29 |     'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json",
 30 |     'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json",
 31 | }
 32 | 
 33 | 
 34 | class XLNetConfig(PretrainedConfig):
 35 |     """Configuration class to store the configuration of a ``XLNetModel``.
 36 | 
 37 |     Args:
 38 |         vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
 39 |         d_model: Size of the encoder layers and the pooler layer.
 40 |         n_layer: Number of hidden layers in the Transformer encoder.
 41 |         n_head: Number of attention heads for each attention layer in
 42 |             the Transformer encoder.
 43 |         d_inner: The size of the "intermediate" (i.e., feed-forward)
 44 |             layer in the Transformer encoder.
 45 |         ff_activation: The non-linear activation function (function or string) in the
 46 |             encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
 47 |         untie_r: untie relative position biases
 48 |         attn_type: 'bi' for XLNet, 'uni' for Transformer-XL
 49 | 
 50 |         dropout: The dropout probabilitiy for all fully connected
 51 |             layers in the embeddings, encoder, and pooler.
 52 |         initializer_range: The sttdev of the truncated_normal_initializer for
 53 |             initializing all weight matrices.
 54 |         layer_norm_eps: The epsilon used by LayerNorm.
 55 | 
 56 |         dropout: float, dropout rate.
 57 |         init: str, the initialization scheme, either "normal" or "uniform".
 58 |         init_range: float, initialize the parameters with a uniform distribution
 59 |             in [-init_range, init_range]. Only effective when init="uniform".
 60 |         init_std: float, initialize the parameters with a normal distribution
 61 |             with mean 0 and stddev init_std. Only effective when init="normal".
 62 |         mem_len: int, the number of tokens to cache.
 63 |         reuse_len: int, the number of tokens in the currect batch to be cached
 64 |             and reused in the future.
 65 |         bi_data: bool, whether to use bidirectional input pipeline.
 66 |             Usually set to True during pretraining and False during finetuning.
 67 |         clamp_len: int, clamp all relative distances larger than clamp_len.
 68 |             -1 means no clamping.
 69 |         same_length: bool, whether to use the same attention length for each token.
 70 |         finetuning_task: name of the glue task on which the model was fine-tuned if any
 71 |     """
 72 |     pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
 73 | 
 74 |     def __init__(self,
 75 |                  vocab_size_or_config_json_file=32000,
 76 |                  d_model=1024,
 77 |                  n_layer=24,
 78 |                  n_head=16,
 79 |                  d_inner=4096,
 80 |                  max_position_embeddings=512,
 81 |                  ff_activation="gelu",
 82 |                  untie_r=True,
 83 |                  attn_type="bi",
 84 | 
 85 |                  initializer_range=0.02,
 86 |                  layer_norm_eps=1e-12,
 87 | 
 88 |                  dropout=0.1,
 89 |                  mem_len=None,
 90 |                  reuse_len=None,
 91 |                  bi_data=False,
 92 |                  clamp_len=-1,
 93 |                  same_length=False,
 94 | 
 95 |                  finetuning_task=None,
 96 |                  num_labels=2,
 97 |                  summary_type='last',
 98 |                  summary_use_proj=True,
 99 |                  summary_activation='tanh',
100 |                  summary_last_dropout=0.1,
101 |                  start_n_top=5,
102 |                  end_n_top=5,
103 |                  **kwargs):
104 |         """Constructs XLNetConfig.
105 |         """
106 |         super(XLNetConfig, self).__init__(**kwargs)
107 | 
108 |         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
109 |                         and isinstance(vocab_size_or_config_json_file, unicode)):
110 |             with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
111 |                 json_config = json.loads(reader.read())
112 |             for key, value in json_config.items():
113 |                 setattr(config, key, value)
114 |         elif isinstance(vocab_size_or_config_json_file, int):
115 |             self.n_token = vocab_size_or_config_json_file
116 |             self.d_model = d_model
117 |             self.n_layer = n_layer
118 |             self.n_head = n_head
119 |             assert d_model % n_head == 0
120 |             self.d_head = d_model // n_head
121 |             self.ff_activation = ff_activation
122 |             self.d_inner = d_inner
123 |             self.untie_r = untie_r
124 |             self.attn_type = attn_type
125 | 
126 |             self.initializer_range = initializer_range
127 |             self.layer_norm_eps = layer_norm_eps
128 | 
129 |             self.dropout = dropout
130 |             self.mem_len = mem_len
131 |             self.reuse_len = reuse_len
132 |             self.bi_data = bi_data
133 |             self.clamp_len = clamp_len
134 |             self.same_length = same_length
135 | 
136 |             self.finetuning_task = finetuning_task
137 |             self.num_labels = num_labels
138 |             self.summary_type = summary_type
139 |             self.summary_use_proj = summary_use_proj
140 |             self.summary_activation = summary_activation
141 |             self.summary_last_dropout = summary_last_dropout
142 |             self.start_n_top = start_n_top
143 |             self.end_n_top = end_n_top
144 |         else:
145 |             raise ValueError("First argument must be either a vocabulary size (int)"
146 |                              " or the path to a pretrained model config file (str)")
147 | 
148 |     @property
149 |     def max_position_embeddings(self):
150 |         return -1
151 | 
152 |     @property
153 |     def vocab_size(self):
154 |         return self.n_token
155 | 
156 |     @vocab_size.setter
157 |     def vocab_size(self, value):
158 |         self.n_token = value
159 | 
160 |     @property
161 |     def hidden_size(self):
162 |         return self.d_model
163 | 
164 |     @property
165 |     def num_attention_heads(self):
166 |         return self.n_head
167 | 
168 |     @property
169 |     def num_hidden_layers(self):
170 |         return self.n_layer
171 | 


--------------------------------------------------------------------------------
/transformers/configuration_transfo_xl.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
  3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | """ Transformer XL configuration """
 17 | 
 18 | from __future__ import absolute_import, division, print_function, unicode_literals
 19 | 
 20 | import json
 21 | import logging
 22 | import sys
 23 | from io import open
 24 | 
 25 | from .configuration_utils import PretrainedConfig
 26 | 
 27 | logger = logging.getLogger(__name__)
 28 | 
 29 | TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 30 |     'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json",
 31 | }
 32 | 
 33 | class TransfoXLConfig(PretrainedConfig):
 34 |     """Configuration class to store the configuration of a `TransfoXLModel`.
 35 | 
 36 |         Args:
 37 |             vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
 38 |             cutoffs: cutoffs for the adaptive softmax
 39 |             d_model: Dimensionality of the model's hidden states.
 40 |             d_embed: Dimensionality of the embeddings
 41 |             d_head: Dimensionality of the model's heads.
 42 |             div_val: divident value for adapative input and softmax
 43 |             pre_lnorm: apply LayerNorm to the input instead of the output
 44 |             d_inner: Inner dimension in FF
 45 |             n_layer: Number of hidden layers in the Transformer encoder.
 46 |             n_head: Number of attention heads for each attention layer in
 47 |                 the Transformer encoder.
 48 |             tgt_len: number of tokens to predict
 49 |             ext_len: length of the extended context
 50 |             mem_len: length of the retained previous heads
 51 |             same_length: use the same attn length for all tokens
 52 |             proj_share_all_but_first: True to share all but first projs, False not to share.
 53 |             attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
 54 |             clamp_len: use the same pos embeddings after clamp_len
 55 |             sample_softmax: number of samples in sampled softmax
 56 |             adaptive: use adaptive softmax
 57 |             tie_weight: tie the word embedding and softmax weights
 58 |             dropout: The dropout probabilitiy for all fully connected
 59 |                 layers in the embeddings, encoder, and pooler.
 60 |             dropatt: The dropout ratio for the attention probabilities.
 61 |             untie_r: untie relative position biases
 62 |             embd_pdrop: The dropout ratio for the embeddings.
 63 |             init: parameter initializer to use
 64 |             init_range: parameters initialized by U(-init_range, init_range).
 65 |             proj_init_std: parameters initialized by N(0, init_std)
 66 |             init_std: parameters initialized by N(0, init_std)
 67 |     """
 68 |     pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
 69 | 
 70 |     def __init__(self,
 71 |                  vocab_size_or_config_json_file=267735,
 72 |                  cutoffs=[20000, 40000, 200000],
 73 |                  d_model=1024,
 74 |                  d_embed=1024,
 75 |                  n_head=16,
 76 |                  d_head=64,
 77 |                  d_inner=4096,
 78 |                  div_val=4,
 79 |                  pre_lnorm=False,
 80 |                  n_layer=18,
 81 |                  tgt_len=128,
 82 |                  ext_len=0,
 83 |                  mem_len=1600,
 84 |                  clamp_len=1000,
 85 |                  same_length=True,
 86 |                  proj_share_all_but_first=True,
 87 |                  attn_type=0,
 88 |                  sample_softmax=-1,
 89 |                  adaptive=True,
 90 |                  tie_weight=True,
 91 |                  dropout=0.1,
 92 |                  dropatt=0.0,
 93 |                  untie_r=True,
 94 |                  init="normal",
 95 |                  init_range=0.01,
 96 |                  proj_init_std=0.01,
 97 |                  init_std=0.02,
 98 |                  layer_norm_epsilon=1e-5,
 99 |                  **kwargs):
100 |         """Constructs TransfoXLConfig.
101 |         """
102 |         super(TransfoXLConfig, self).__init__(**kwargs)
103 |         self.n_token = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
104 |         self.cutoffs = []
105 |         self.cutoffs.extend(cutoffs)
106 |         self.tie_weight = tie_weight
107 |         if proj_share_all_but_first:
108 |             self.tie_projs = [False] + [True] * len(self.cutoffs)
109 |         else:
110 |             self.tie_projs = [False] + [False] * len(self.cutoffs)
111 |         self.d_model = d_model
112 |         self.d_embed = d_embed
113 |         self.d_head = d_head
114 |         self.d_inner = d_inner
115 |         self.div_val = div_val
116 |         self.pre_lnorm = pre_lnorm
117 |         self.n_layer = n_layer
118 |         self.n_head = n_head
119 |         self.tgt_len = tgt_len
120 |         self.ext_len = ext_len
121 |         self.mem_len = mem_len
122 |         self.same_length = same_length
123 |         self.attn_type = attn_type
124 |         self.clamp_len = clamp_len
125 |         self.sample_softmax = sample_softmax
126 |         self.adaptive = adaptive
127 |         self.dropout = dropout
128 |         self.dropatt = dropatt
129 |         self.untie_r = untie_r
130 |         self.init = init
131 |         self.init_range = init_range
132 |         self.proj_init_std = proj_init_std
133 |         self.init_std = init_std
134 |         self.layer_norm_epsilon = layer_norm_epsilon
135 | 
136 |         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
137 |                         and isinstance(vocab_size_or_config_json_file, unicode)):
138 |             with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
139 |                 json_config = json.loads(reader.read())
140 |             for key, value in json_config.items():
141 |                 self.__dict__[key] = value
142 |         elif not isinstance(vocab_size_or_config_json_file, int):
143 |             raise ValueError("First argument must be either a vocabulary size (int)"
144 |                              " or the path to a pretrained model config file (str)")
145 | 
146 |     @property
147 |     def max_position_embeddings(self):
148 |         return self.tgt_len + self.ext_len + self.mem_len
149 | 
150 |     @property
151 |     def vocab_size(self):
152 |         return self.n_token
153 | 
154 |     @vocab_size.setter
155 |     def vocab_size(self, value):
156 |         self.n_token = value
157 | 
158 |     @property
159 |     def hidden_size(self):
160 |         return self.d_model
161 | 
162 |     @property
163 |     def num_attention_heads(self):
164 |         return self.n_head
165 | 
166 |     @property
167 |     def num_hidden_layers(self):
168 |         return self.n_layer
169 | 


--------------------------------------------------------------------------------
/transformers/__main__.py:
--------------------------------------------------------------------------------
  1 | # coding: utf8
  2 | def main():
  3 |     import sys
  4 |     if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]:
  5 |         print(
  6 |         "This command line utility let you convert original (author released) model checkpoint to pytorch.\n"
  7 |         "It should be used as one of: \n"
  8 |         ">> transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n"
  9 |         ">> transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n"
 10 |         ">> transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n"
 11 |         ">> transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n"
 12 |         ">> transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n"
 13 |         ">> transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT")
 14 |     else:
 15 |         if sys.argv[1] == "bert":
 16 |             try:
 17 |                 from .convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
 18 |             except ImportError:
 19 |                 print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
 20 |                     "In that case, it requires TensorFlow to be installed. Please see "
 21 |                     "https://www.tensorflow.org/install/ for installation instructions.")
 22 |                 raise
 23 | 
 24 |             if len(sys.argv) != 5:
 25 |                 # pylint: disable=line-too-long
 26 |                 print("Should be used as `transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
 27 |             else:
 28 |                 PYTORCH_DUMP_OUTPUT = sys.argv.pop()
 29 |                 TF_CONFIG = sys.argv.pop()
 30 |                 TF_CHECKPOINT = sys.argv.pop()
 31 |                 convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
 32 |         elif sys.argv[1] == "gpt":
 33 |             from .convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
 34 |             if len(sys.argv) < 4 or len(sys.argv) > 5:
 35 |                 # pylint: disable=line-too-long
 36 |                 print("Should be used as `transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
 37 |             else:
 38 |                 OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
 39 |                 PYTORCH_DUMP_OUTPUT = sys.argv[3]
 40 |                 if len(sys.argv) == 5:
 41 |                     OPENAI_GPT_CONFIG = sys.argv[4]
 42 |                 else:
 43 |                     OPENAI_GPT_CONFIG = ""
 44 |                 convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH,
 45 |                                                     OPENAI_GPT_CONFIG,
 46 |                                                     PYTORCH_DUMP_OUTPUT)
 47 |         elif sys.argv[1] == "transfo_xl":
 48 |             try:
 49 |                 from .convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
 50 |             except ImportError:
 51 |                 print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
 52 |                     "In that case, it requires TensorFlow to be installed. Please see "
 53 |                     "https://www.tensorflow.org/install/ for installation instructions.")
 54 |                 raise
 55 |             if len(sys.argv) < 4 or len(sys.argv) > 5:
 56 |                 # pylint: disable=line-too-long
 57 |                 print("Should be used as `transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
 58 |             else:
 59 |                 if 'ckpt' in sys.argv[2].lower():
 60 |                     TF_CHECKPOINT = sys.argv[2]
 61 |                     TF_DATASET_FILE = ""
 62 |                 else:
 63 |                     TF_DATASET_FILE = sys.argv[2]
 64 |                     TF_CHECKPOINT = ""
 65 |                 PYTORCH_DUMP_OUTPUT = sys.argv[3]
 66 |                 if len(sys.argv) == 5:
 67 |                     TF_CONFIG = sys.argv[4]
 68 |                 else:
 69 |                     TF_CONFIG = ""
 70 |                 convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE)
 71 |         elif sys.argv[1] == "gpt2":
 72 |             try:
 73 |                 from .convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
 74 |             except ImportError:
 75 |                 print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
 76 |                     "In that case, it requires TensorFlow to be installed. Please see "
 77 |                     "https://www.tensorflow.org/install/ for installation instructions.")
 78 |                 raise
 79 | 
 80 |             if len(sys.argv) < 4 or len(sys.argv) > 5:
 81 |                 # pylint: disable=line-too-long
 82 |                 print("Should be used as `transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
 83 |             else:
 84 |                 TF_CHECKPOINT = sys.argv[2]
 85 |                 PYTORCH_DUMP_OUTPUT = sys.argv[3]
 86 |                 if len(sys.argv) == 5:
 87 |                     TF_CONFIG = sys.argv[4]
 88 |                 else:
 89 |                     TF_CONFIG = ""
 90 |                 convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
 91 |         elif sys.argv[1] == "xlnet":
 92 |             try:
 93 |                 from .convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
 94 |             except ImportError:
 95 |                 print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
 96 |                     "In that case, it requires TensorFlow to be installed. Please see "
 97 |                     "https://www.tensorflow.org/install/ for installation instructions.")
 98 |                 raise
 99 | 
100 |             if len(sys.argv) < 5 or len(sys.argv) > 6:
101 |                 # pylint: disable=line-too-long
102 |                 print("Should be used as `transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
103 |             else:
104 |                 TF_CHECKPOINT = sys.argv[2]
105 |                 TF_CONFIG = sys.argv[3]
106 |                 PYTORCH_DUMP_OUTPUT = sys.argv[4]
107 |                 if len(sys.argv) == 6:
108 |                     FINETUNING_TASK = sys.argv[5]
109 |                 else:
110 |                     FINETUNING_TASK = None
111 | 
112 |                 convert_xlnet_checkpoint_to_pytorch(TF_CHECKPOINT,
113 |                                                     TF_CONFIG,
114 |                                                     PYTORCH_DUMP_OUTPUT,
115 |                                                     FINETUNING_TASK)
116 |         elif sys.argv[1] == "xlm":
117 |             from .convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch
118 | 
119 |             if len(sys.argv) != 4:
120 |                 # pylint: disable=line-too-long
121 |                 print("Should be used as `transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`")
122 |             else:
123 |                 XLM_CHECKPOINT_PATH = sys.argv[2]
124 |                 PYTORCH_DUMP_OUTPUT = sys.argv[3]
125 | 
126 |                 convert_xlm_checkpoint_to_pytorch(XLM_CHECKPOINT_PATH, PYTORCH_DUMP_OUTPUT)
127 | 
128 | if __name__ == '__main__':
129 |     main()
130 | 


--------------------------------------------------------------------------------
/transformers/hf_api.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2019-present, the HuggingFace Inc. team.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | from __future__ import absolute_import, division, print_function
 16 | 
 17 | import os
 18 | from os.path import expanduser
 19 | 
 20 | import requests
 21 | import six
 22 | from requests.exceptions import HTTPError
 23 | from tqdm import tqdm
 24 | 
 25 | ENDPOINT = "https://huggingface.co"
 26 | 
 27 | class S3Obj:
 28 |     def __init__(
 29 |         self,
 30 |         filename,     # type: str
 31 |         LastModified, # type: str
 32 |         ETag,         # type: str
 33 |         Size,         # type: int
 34 |         **kwargs
 35 |     ):
 36 |         self.filename = filename
 37 |         self.LastModified = LastModified
 38 |         self.ETag = ETag
 39 |         self.Size = Size
 40 | 
 41 | 
 42 | class PresignedUrl:
 43 |     def __init__(
 44 |         self,
 45 |         write,  # type: str
 46 |         access, # type: str
 47 |         type,   # type: str
 48 |         **kwargs
 49 |     ):
 50 |         self.write = write
 51 |         self.access = access
 52 |         self.type = type # mime-type to send to S3.
 53 | 
 54 | 
 55 | class HfApi:
 56 |     def __init__(self, endpoint=None):
 57 |         self.endpoint = endpoint if endpoint is not None else ENDPOINT
 58 | 
 59 |     def login(
 60 |         self,
 61 |         username, # type: str
 62 |         password, # type: str
 63 |     ):
 64 |         # type: (...) -> str
 65 |         """
 66 |         Call HF API to sign in a user and get a token if credentials are valid.
 67 | 
 68 |         Outputs:
 69 |             token if credentials are valid
 70 | 
 71 |         Throws:
 72 |             requests.exceptions.HTTPError if credentials are invalid
 73 |         """
 74 |         path = "{}/api/login".format(self.endpoint)
 75 |         r = requests.post(path, json={"username": username, "password": password})
 76 |         r.raise_for_status()
 77 |         d = r.json()
 78 |         return d["token"]
 79 | 
 80 |     def whoami(
 81 |         self,
 82 |         token, # type: str
 83 |     ):
 84 |         # type: (...) -> str
 85 |         """
 86 |         Call HF API to know "whoami"
 87 |         """
 88 |         path = "{}/api/whoami".format(self.endpoint)
 89 |         r = requests.get(path, headers={"authorization": "Bearer {}".format(token)})
 90 |         r.raise_for_status()
 91 |         d = r.json()
 92 |         return d["user"]
 93 | 
 94 |     def logout(self, token):
 95 |         # type: (...) -> void
 96 |         """
 97 |         Call HF API to log out.
 98 |         """
 99 |         path = "{}/api/logout".format(self.endpoint)
100 |         r = requests.post(path, headers={"authorization": "Bearer {}".format(token)})
101 |         r.raise_for_status()
102 | 
103 |     def presign(self, token, filename):
104 |         # type: (...) -> PresignedUrl
105 |         """
106 |         Call HF API to get a presigned url to upload `filename` to S3.
107 |         """
108 |         path = "{}/api/presign".format(self.endpoint)
109 |         r = requests.post(
110 |             path,
111 |             headers={"authorization": "Bearer {}".format(token)},
112 |             json={"filename": filename},
113 |         )
114 |         r.raise_for_status()
115 |         d = r.json()
116 |         return PresignedUrl(**d)
117 | 
118 |     def presign_and_upload(self, token, filename, filepath):
119 |         # type: (...) -> str
120 |         """
121 |         Get a presigned url, then upload file to S3.
122 | 
123 |         Outputs:
124 |             url: Read-only url for the stored file on S3.
125 |         """
126 |         urls = self.presign(token, filename=filename)
127 |         # streaming upload:
128 |         # https://2.python-requests.org/en/master/user/advanced/#streaming-uploads
129 |         # 
130 |         # Even though we presign with the correct content-type,
131 |         # the client still has to specify it when uploading the file.
132 |         with open(filepath, "rb") as f:
133 |             pf = TqdmProgressFileReader(f)
134 | 
135 |             r = requests.put(urls.write, data=f, headers={
136 |                 "content-type": urls.type,
137 |             })
138 |             r.raise_for_status()
139 |             pf.close()
140 |         return urls.access
141 | 
142 |     def list_objs(self, token):
143 |         # type: (...) -> List[S3Obj]
144 |         """
145 |         Call HF API to list all stored files for user.
146 |         """
147 |         path = "{}/api/listObjs".format(self.endpoint)
148 |         r = requests.get(path, headers={"authorization": "Bearer {}".format(token)})
149 |         r.raise_for_status()
150 |         d = r.json()
151 |         return [S3Obj(**x) for x in d]
152 | 
153 | 
154 | 
155 | class TqdmProgressFileReader:
156 |     """
157 |     Wrap an io.BufferedReader `f` (such as the output of `open(…, "rb")`)
158 |     and override `f.read()` so as to display a tqdm progress bar.
159 | 
160 |     see github.com/huggingface/transformers/pull/2078#discussion_r354739608
161 |     for implementation details.
162 |     """
163 |     def __init__(
164 |         self,
165 |         f   # type: io.BufferedReader
166 |     ):
167 |         self.f = f
168 |         self.total_size = os.fstat(f.fileno()).st_size # type: int
169 |         self.pbar = tqdm(total=self.total_size, leave=False)
170 |         if six.PY3:
171 |             # does not work unless PY3
172 |             # no big deal as the CLI does not currently support PY2 anyways.
173 |             self.read = f.read
174 |             f.read = self._read
175 | 
176 |     def _read(self, n=-1):
177 |         self.pbar.update(n)
178 |         return self.read(n)
179 | 
180 |     def close(self):
181 |         self.pbar.close()
182 | 
183 | 
184 | 
185 | class HfFolder:
186 |     path_token = expanduser("~/.huggingface/token")
187 | 
188 |     @classmethod
189 |     def save_token(cls, token):
190 |         """
191 |         Save token, creating folder as needed.
192 |         """
193 |         if six.PY3:
194 |             os.makedirs(os.path.dirname(cls.path_token), exist_ok=True)
195 |         else:
196 |             # Python 2
197 |             try:
198 |                 os.makedirs(os.path.dirname(cls.path_token))
199 |             except OSError as e:
200 |                 if e.errno != os.errno.EEXIST:
201 |                     raise e
202 |                 pass
203 |         with open(cls.path_token, 'w+') as f:
204 |             f.write(token)
205 | 
206 |     @classmethod
207 |     def get_token(cls):
208 |         """
209 |         Get token or None if not existent.
210 |         """
211 |         try:
212 |             with open(cls.path_token, 'r') as f:
213 |                 return f.read()
214 |         except:
215 |             # this is too wide. When Py2 is dead use:
216 |             # `except FileNotFoundError:` instead
217 |             return None
218 | 
219 |     @classmethod
220 |     def delete_token(cls):
221 |         """
222 |         Delete token.
223 |         Do not fail if token does not exist.
224 |         """
225 |         try:
226 |             os.remove(cls.path_token)
227 |         except:
228 |             return
229 | 


--------------------------------------------------------------------------------
/transformers/configuration_bert.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
  3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | """ BERT model configuration """
 17 | 
 18 | from __future__ import absolute_import, division, print_function, unicode_literals
 19 | 
 20 | import json
 21 | import logging
 22 | import sys
 23 | from io import open
 24 | 
 25 | from .configuration_utils import PretrainedConfig
 26 | 
 27 | logger = logging.getLogger(__name__)
 28 | 
 29 | BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 30 |     'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json",
 31 |     'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json",
 32 |     'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json",
 33 |     'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json",
 34 |     'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json",
 35 |     'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json",
 36 |     'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json",
 37 |     'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json",
 38 |     'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json",
 39 |     'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json",
 40 |     'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json",
 41 |     'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json",
 42 |     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
 43 |     'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
 44 |     'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
 45 |     'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json",
 46 |     'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json",
 47 |     'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json",
 48 |     'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json"
 49 | }
 50 | 
 51 | 
 52 | class BertConfig(PretrainedConfig):
 53 |     r"""
 54 |         :class:`~transformers.BertConfig` is the configuration class to store the configuration of a
 55 |         `BertModel`.
 56 | 
 57 | 
 58 |         Arguments:
 59 |             vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
 60 |             hidden_size: Size of the encoder layers and the pooler layer.
 61 |             num_hidden_layers: Number of hidden layers in the Transformer encoder.
 62 |             num_attention_heads: Number of attention heads for each attention layer in
 63 |                 the Transformer encoder.
 64 |             intermediate_size: The size of the "intermediate" (i.e., feed-forward)
 65 |                 layer in the Transformer encoder.
 66 |             hidden_act: The non-linear activation function (function or string) in the
 67 |                 encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
 68 |             hidden_dropout_prob: The dropout probabilitiy for all fully connected
 69 |                 layers in the embeddings, encoder, and pooler.
 70 |             attention_probs_dropout_prob: The dropout ratio for the attention
 71 |                 probabilities.
 72 |             max_position_embeddings: The maximum sequence length that this model might
 73 |                 ever be used with. Typically set this to something large just in case
 74 |                 (e.g., 512 or 1024 or 2048).
 75 |             type_vocab_size: The vocabulary size of the `token_type_ids` passed into
 76 |                 `BertModel`.
 77 |             initializer_range: The sttdev of the truncated_normal_initializer for
 78 |                 initializing all weight matrices.
 79 |             layer_norm_eps: The epsilon used by LayerNorm.
 80 |     """
 81 |     pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 82 | 
 83 |     def __init__(self,
 84 |                  vocab_size_or_config_json_file=30522,
 85 |                  hidden_size=768,
 86 |                  num_hidden_layers=12,
 87 |                  num_attention_heads=12,
 88 |                  intermediate_size=3072,
 89 |                  hidden_act="gelu",
 90 |                  hidden_dropout_prob=0.1,
 91 |                  attention_probs_dropout_prob=0.1,
 92 |                  max_position_embeddings=512,
 93 |                  type_vocab_size=2,
 94 |                  initializer_range=0.02,
 95 |                  layer_norm_eps=1e-12,
 96 |                  **kwargs):
 97 |         super(BertConfig, self).__init__(**kwargs)
 98 |         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
 99 |                         and isinstance(vocab_size_or_config_json_file, unicode)):
100 |             with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
101 |                 json_config = json.loads(reader.read())
102 |             for key, value in json_config.items():
103 |                 self.__dict__[key] = value
104 |         elif isinstance(vocab_size_or_config_json_file, int):
105 |             self.vocab_size = vocab_size_or_config_json_file
106 |             self.hidden_size = hidden_size
107 |             self.num_hidden_layers = num_hidden_layers
108 |             self.num_attention_heads = num_attention_heads
109 |             self.hidden_act = hidden_act
110 |             self.intermediate_size = intermediate_size
111 |             self.hidden_dropout_prob = hidden_dropout_prob
112 |             self.attention_probs_dropout_prob = attention_probs_dropout_prob
113 |             self.max_position_embeddings = max_position_embeddings
114 |             self.type_vocab_size = type_vocab_size
115 |             self.initializer_range = initializer_range
116 |             self.layer_norm_eps = layer_norm_eps
117 |         else:
118 |             raise ValueError("First argument must be either a vocabulary size (int)"
119 |                              " or the path to a pretrained model config file (str)")
120 | 


--------------------------------------------------------------------------------
/transformers/tokenization_roberta.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """Tokenization classes for RoBERTa."""
 16 | from __future__ import (absolute_import, division, print_function,
 17 |                         unicode_literals)
 18 | 
 19 | import sys
 20 | import json
 21 | import logging
 22 | import os
 23 | import regex as re
 24 | from io import open
 25 | 
 26 | from .tokenization_gpt2 import GPT2Tokenizer
 27 | 
 28 | try:
 29 |     from functools import lru_cache
 30 | except ImportError:
 31 |     # Just a dummy decorator to get the checks to run on python2
 32 |     # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
 33 |     def lru_cache():
 34 |         return lambda func: func
 35 | 
 36 | logger = logging.getLogger(__name__)
 37 | 
 38 | VOCAB_FILES_NAMES = {
 39 |     'vocab_file': 'vocab.json',
 40 |     'merges_file': 'merges.txt',
 41 | }
 42 | 
 43 | PRETRAINED_VOCAB_FILES_MAP = {
 44 |     'vocab_file':
 45 |     {
 46 |         'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
 47 |         'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
 48 |         'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json",
 49 |         'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json",
 50 |         'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
 51 |         'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
 52 |     },
 53 |     'merges_file':
 54 |     {
 55 |         'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
 56 |         'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
 57 |         'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt",
 58 |         'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txt",
 59 |         'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
 60 |         'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
 61 |     },
 62 | }
 63 | 
 64 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 65 |     'roberta-base': 512,
 66 |     'roberta-large': 512,
 67 |     'roberta-large-mnli': 512,
 68 |     'distilroberta-base': 512,
 69 |     'roberta-base-openai-detector': 512,
 70 |     'roberta-large-openai-detector': 512,
 71 | }
 72 | 
 73 | 
 74 | class RobertaTokenizer(GPT2Tokenizer):
 75 |     """
 76 |     RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:
 77 |         - Byte-level Byte-Pair-Encoding
 78 |         - Requires a space to start the input string => the encoding methods should be called with the
 79 |           ``add_prefix_space`` flag set to ``True``.
 80 |           Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
 81 |           the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
 82 |     """
 83 |     vocab_files_names = VOCAB_FILES_NAMES
 84 |     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
 85 |     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 86 | 
 87 |     def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>",
 88 |                  cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs):
 89 |         super(RobertaTokenizer, self).__init__(vocab_file=vocab_file, merges_file=merges_file, errors=errors,
 90 |                                                bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
 91 |                                                sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
 92 |                                                mask_token=mask_token, **kwargs)
 93 |         self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
 94 |         self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
 95 | 
 96 |     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
 97 |         """
 98 |         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
 99 |         by concatenating and adding special tokens.
100 |         A RoBERTa sequence has the following format:
101 |             single sequence: <s> X </s>
102 |             pair of sequences: <s> A </s></s> B </s>
103 |         """
104 |         if token_ids_1 is None:
105 |             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
106 |         cls = [self.cls_token_id]
107 |         sep = [self.sep_token_id]
108 |         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
109 | 
110 |     def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
111 |         """
112 |         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
113 |         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
114 | 
115 |         Args:
116 |             token_ids_0: list of ids (must not contain special tokens)
117 |             token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
118 |                 for sequence pairs
119 |             already_has_special_tokens: (default False) Set to True if the token list is already formated with
120 |                 special tokens for the model
121 | 
122 |         Returns:
123 |             A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
124 |         """
125 |         if already_has_special_tokens:
126 |             if token_ids_1 is not None:
127 |                 raise ValueError("You should not supply a second sequence if the provided sequence of "
128 |                                  "ids is already formated with special tokens for the model.")
129 |             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
130 | 
131 |         if token_ids_1 is None:
132 |             return [1] + ([0] * len(token_ids_0)) + [1]
133 |         return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
134 | 
135 |     def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
136 |         """
137 |         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
138 |         A RoBERTa sequence pair mask has the following format:
139 |         0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
140 |         | first sequence    | second sequence
141 | 
142 |         if token_ids_1 is None, only returns the first portion of the mask (0's).
143 |         """
144 |         sep = [self.sep_token_id]
145 |         cls = [self.cls_token_id]
146 | 
147 |         if token_ids_1 is None:
148 |             return len(cls + token_ids_0 + sep) * [0]
149 |         return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
150 | 


--------------------------------------------------------------------------------
/transformers/tokenization_camembert.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License
 15 | """ Tokenization classes for Camembert model."""
 16 | from __future__ import (absolute_import, division, print_function,
 17 |                         unicode_literals)
 18 | 
 19 | import logging
 20 | import os
 21 | from shutil import copyfile
 22 | 
 23 | import sentencepiece as spm
 24 | from transformers.tokenization_utils import PreTrainedTokenizer
 25 | 
 26 | logger = logging.getLogger(__name__)
 27 | 
 28 | VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'}
 29 | 
 30 | PRETRAINED_VOCAB_FILES_MAP = {
 31 |     'vocab_file':
 32 |     {
 33 |     'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model",
 34 |     }
 35 | }
 36 | 
 37 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 38 |     'camembert-base': None,
 39 | }
 40 | 
 41 | class CamembertTokenizer(PreTrainedTokenizer):
 42 |     """
 43 |         Adapted from RobertaTokenizer and XLNetTokenizer
 44 |         SentencePiece based tokenizer. Peculiarities:
 45 | 
 46 |             - requires `SentencePiece <https://github.com/google/sentencepiece>`_
 47 |     """
 48 |     vocab_files_names = VOCAB_FILES_NAMES
 49 |     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
 50 |     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 51 | 
 52 |     def __init__(self, vocab_file, bos_token="<s>", eos_token="</s>", sep_token="</s>",
 53 |                  cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>',
 54 |                  additional_special_tokens=['<s>NOTUSED', '</s>NOTUSED'], **kwargs):
 55 |         super(CamembertTokenizer, self).__init__(max_len=512, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
 56 |                                                  sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
 57 |                                                  mask_token=mask_token, additional_special_tokens=additional_special_tokens,
 58 |                                                  **kwargs)
 59 |         self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
 60 |         self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
 61 |         self.sp_model = spm.SentencePieceProcessor()
 62 |         self.sp_model.Load(str(vocab_file))
 63 |         self.vocab_file = vocab_file
 64 |         # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual
 65 |         # sentencepiece vocabulary (this is the case for <s> and </s>
 66 |         self.fairseq_tokens_to_ids = {'<s>NOTUSED': 0, '<pad>': 1, '</s>NOTUSED': 2, '<unk>': 3}
 67 |         self.fairseq_offset = len(self.fairseq_tokens_to_ids)
 68 |         self.fairseq_tokens_to_ids['<mask>'] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
 69 |         self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
 70 | 
 71 |     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
 72 |         """
 73 |         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
 74 |         by concatenating and adding special tokens.
 75 |         A RoBERTa sequence has the following format:
 76 |             single sequence: <s> X </s>
 77 |             pair of sequences: <s> A </s></s> B </s>
 78 |         """
 79 |         if token_ids_1 is None:
 80 |             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 81 |         cls = [self.cls_token_id]
 82 |         sep = [self.sep_token_id]
 83 |         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
 84 | 
 85 |     def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
 86 |         """
 87 |         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
 88 |         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
 89 | 
 90 |         Args:
 91 |             token_ids_0: list of ids (must not contain special tokens)
 92 |             token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
 93 |                 for sequence pairs
 94 |             already_has_special_tokens: (default False) Set to True if the token list is already formated with
 95 |                 special tokens for the model
 96 | 
 97 |         Returns:
 98 |             A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
 99 |         """
100 |         if already_has_special_tokens:
101 |             if token_ids_1 is not None:
102 |                 raise ValueError("You should not supply a second sequence if the provided sequence of "
103 |                                  "ids is already formated with special tokens for the model.")
104 |             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
105 | 
106 |         if token_ids_1 is None:
107 |             return [1] + ([0] * len(token_ids_0)) + [1]
108 |         return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
109 | 
110 |     def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
111 |         """
112 |         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
113 |         A RoBERTa sequence pair mask has the following format:
114 |         0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
115 |         | first sequence    | second sequence
116 | 
117 |         if token_ids_1 is None, only returns the first portion of the mask (0's).
118 |         """
119 |         sep = [self.sep_token_id]
120 |         cls = [self.cls_token_id]
121 | 
122 |         if token_ids_1 is None:
123 |             return len(cls + token_ids_0 + sep) * [0]
124 |         return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
125 | 
126 |     @property
127 |     def vocab_size(self):
128 |         return len(self.fairseq_tokens_to_ids) + len(self.sp_model)
129 | 
130 |     def _tokenize(self, text):
131 |         return self.sp_model.EncodeAsPieces(text)
132 | 
133 |     def _convert_token_to_id(self, token):
134 |         """ Converts a token (str/unicode) in an id using the vocab. """
135 |         if token in self.fairseq_tokens_to_ids:
136 |             return self.fairseq_tokens_to_ids[token]
137 |         elif self.sp_model.PieceToId(token) == 0:
138 |             # Convert sentence piece unk token to fairseq unk token index
139 |             return self.unk_token_id
140 |         return self.fairseq_offset + self.sp_model.PieceToId(token)
141 | 
142 |     def _convert_id_to_token(self, index):
143 |         """Converts an index (integer) in a token (string/unicode) using the vocab."""
144 |         if index in self.fairseq_ids_to_tokens:
145 |             return self.fairseq_ids_to_tokens[index]
146 |         return self.sp_model.IdToPiece(index - self.fairseq_offset)
147 | 
148 |     def save_vocabulary(self, save_directory):
149 |         """ Save the sentencepiece vocabulary (copy original file) and special tokens file
150 |             to a directory.
151 |         """
152 |         if not os.path.isdir(save_directory):
153 |             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
154 |             return
155 |         out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
156 | 
157 |         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
158 |             copyfile(self.vocab_file, out_vocab_file)
159 | 
160 |         return (out_vocab_file,)
161 | 


--------------------------------------------------------------------------------
/transformers/optimization.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """PyTorch optimization for BERT model."""
 16 | 
 17 | import logging
 18 | import math
 19 | 
 20 | import torch
 21 | from torch.optim import Optimizer
 22 | from torch.optim.lr_scheduler import LambdaLR
 23 | 
 24 | logger = logging.getLogger(__name__)
 25 | 
 26 | 
 27 | def get_constant_schedule(optimizer, last_epoch=-1):
 28 |     """ Create a schedule with a constant learning rate.
 29 |     """
 30 |     return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch)
 31 | 
 32 | 
 33 | def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1):
 34 |     """ Create a schedule with a constant learning rate preceded by a warmup
 35 |     period during which the learning rate increases linearly between 0 and 1.
 36 |     """
 37 |     def lr_lambda(current_step):
 38 |         if current_step < num_warmup_steps:
 39 |             return float(current_step) / float(max(1.0, num_warmup_steps))
 40 |         return 1.
 41 | 
 42 |     return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
 43 | 
 44 | 
 45 | def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
 46 |     """ Create a schedule with a learning rate that decreases linearly after
 47 |     linearly increasing during a warmup period.
 48 |     """
 49 |     def lr_lambda(current_step):
 50 |         if current_step < num_warmup_steps:
 51 |             return float(current_step) / float(max(1, num_warmup_steps))
 52 |         return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)))
 53 | 
 54 |     return LambdaLR(optimizer, lr_lambda, last_epoch)
 55 | 
 56 | 
 57 | def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=.5, last_epoch=-1):
 58 |     """ Create a schedule with a learning rate that decreases following the
 59 |     values of the cosine function between 0 and `pi * cycles` after a warmup
 60 |     period during which it increases linearly between 0 and 1.
 61 |     """
 62 |     def lr_lambda(current_step):
 63 |         if current_step < num_warmup_steps:
 64 |             return float(current_step) / float(max(1, num_warmup_steps))
 65 |         progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
 66 |         return max(0., 0.5 * (1. + math.cos(math.pi * float(num_cycles) * 2. * progress)))
 67 | 
 68 |     return LambdaLR(optimizer, lr_lambda, last_epoch)
 69 | 
 70 | 
 71 | def get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=1., last_epoch=-1):
 72 |     """ Create a schedule with a learning rate that decreases following the
 73 |     values of the cosine function with several hard restarts, after a warmup
 74 |     period during which it increases linearly between 0 and 1.
 75 |     """
 76 |     def lr_lambda(current_step):
 77 |         if current_step < num_warmup_steps:
 78 |             return float(current_step) / float(max(1, num_warmup_steps))
 79 |         progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
 80 |         if progress >= 1.:
 81 |             return 0.
 82 |         return max(0., 0.5 * (1. + math.cos(math.pi * ((float(num_cycles) * progress) % 1.))))
 83 | 
 84 |     return LambdaLR(optimizer, lr_lambda, last_epoch)
 85 | 
 86 | 
 87 | class AdamW(Optimizer):
 88 |     """ Implements Adam algorithm with weight decay fix.
 89 | 
 90 |     Parameters:
 91 |         lr (float): learning rate. Default 1e-3.
 92 |         betas (tuple of 2 floats): Adams beta parameters (b1, b2). Default: (0.9, 0.999)
 93 |         eps (float): Adams epsilon. Default: 1e-6
 94 |         weight_decay (float): Weight decay. Default: 0.0
 95 |         correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True.
 96 |     """
 97 |     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True):
 98 |         if lr < 0.0:
 99 |             raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
100 |         if not 0.0 <= betas[0] < 1.0:
101 |             raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0]))
102 |         if not 0.0 <= betas[1]  < 1.0:
103 |             raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1]))
104 |         if not 0.0 <= eps:
105 |             raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps))
106 |         defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
107 |                         correct_bias=correct_bias)
108 |         super(AdamW, self).__init__(params, defaults)
109 | 
110 |     def step(self, closure=None):
111 |         """Performs a single optimization step.
112 | 
113 |         Arguments:
114 |             closure (callable, optional): A closure that reevaluates the model
115 |                 and returns the loss.
116 |         """
117 |         loss = None
118 |         if closure is not None:
119 |             loss = closure()
120 | 
121 |         for group in self.param_groups:
122 |             for p in group['params']:
123 |                 if p.grad is None:
124 |                     continue
125 |                 grad = p.grad.data
126 |                 if grad.is_sparse:
127 |                     raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
128 | 
129 |                 state = self.state[p]
130 | 
131 |                 # State initialization
132 |                 if len(state) == 0:
133 |                     state['step'] = 0
134 |                     # Exponential moving average of gradient values
135 |                     state['exp_avg'] = torch.zeros_like(p.data)
136 |                     # Exponential moving average of squared gradient values
137 |                     state['exp_avg_sq'] = torch.zeros_like(p.data)
138 | 
139 |                 exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
140 |                 beta1, beta2 = group['betas']
141 | 
142 |                 state['step'] += 1
143 | 
144 |                 # Decay the first and second moment running average coefficient
145 |                 # In-place operations to update the averages at the same time
146 |                 exp_avg.mul_(beta1).add_(1.0 - beta1, grad)
147 |                 exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad)
148 |                 denom = exp_avg_sq.sqrt().add_(group['eps'])
149 | 
150 |                 step_size = group['lr']
151 |                 if group['correct_bias']:  # No bias correction for Bert
152 |                     bias_correction1 = 1.0 - beta1 ** state['step']
153 |                     bias_correction2 = 1.0 - beta2 ** state['step']
154 |                     step_size = step_size * math.sqrt(bias_correction2) / bias_correction1
155 | 
156 |                 p.data.addcdiv_(-step_size, exp_avg, denom)
157 | 
158 |                 # Just adding the square of the weights to the loss function is *not*
159 |                 # the correct way of using L2 regularization/weight decay with Adam,
160 |                 # since that will interact with the m and v parameters in strange ways.
161 |                 #
162 |                 # Instead we want to decay the weights in a manner that doesn't interact
163 |                 # with the m/v parameters. This is equivalent to adding the square
164 |                 # of the weights to the loss with plain (non-momentum) SGD.
165 |                 # Add weight decay at the end (fixed version)
166 |                 if group['weight_decay'] > 0.0:
167 |                     p.data.add_(-group['lr'] * group['weight_decay'], p.data)
168 | 
169 |         return loss
170 | 


--------------------------------------------------------------------------------
/transformers/tokenization_openai.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """Tokenization classes for OpenAI GPT."""
 16 | from __future__ import (absolute_import, division, print_function,
 17 |                         unicode_literals)
 18 | 
 19 | import json
 20 | import logging
 21 | import os
 22 | import re
 23 | from io import open
 24 | 
 25 | from .tokenization_utils import PreTrainedTokenizer
 26 | from .tokenization_bert import BasicTokenizer
 27 | 
 28 | logger = logging.getLogger(__name__)
 29 | 
 30 | VOCAB_FILES_NAMES = {
 31 |     'vocab_file': 'vocab.json',
 32 |     'merges_file': 'merges.txt',
 33 | }
 34 | 
 35 | PRETRAINED_VOCAB_FILES_MAP = {
 36 |     'vocab_file':
 37 |     {
 38 |         'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json",
 39 |     },
 40 |     'merges_file':
 41 |     {
 42 |         'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",
 43 |     },
 44 | }
 45 | 
 46 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 47 |     'openai-gpt': 512,
 48 | }
 49 | 
 50 | def get_pairs(word):
 51 |     """
 52 |     Return set of symbol pairs in a word.
 53 |     word is represented as tuple of symbols (symbols being variable-length strings)
 54 |     """
 55 |     pairs = set()
 56 |     prev_char = word[0]
 57 |     for char in word[1:]:
 58 |         pairs.add((prev_char, char))
 59 |         prev_char = char
 60 |     return pairs
 61 | 
 62 | def text_standardize(text):
 63 |     """
 64 |     fixes some issues the spacy tokenizer had on books corpus
 65 |     also does some whitespace standardization
 66 |     """
 67 |     text = text.replace('—', '-')
 68 |     text = text.replace('–', '-')
 69 |     text = text.replace('―', '-')
 70 |     text = text.replace('…', '...')
 71 |     text = text.replace('´', "'")
 72 |     text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
 73 |     text = re.sub(r'\s*\n\s*', ' \n ', text)
 74 |     text = re.sub(r'[^\S\n]+', ' ', text)
 75 |     return text.strip()
 76 | 
 77 | class OpenAIGPTTokenizer(PreTrainedTokenizer):
 78 |     """
 79 |     BPE tokenizer. Peculiarities:
 80 |         - lower case all inputs
 81 |         - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
 82 |     """
 83 |     vocab_files_names = VOCAB_FILES_NAMES
 84 |     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
 85 |     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 86 | 
 87 |     def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
 88 |         super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)
 89 | 
 90 |         self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
 91 |         self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
 92 | 
 93 |         try:
 94 |             import ftfy
 95 |             from spacy.lang.en import English
 96 |             _nlp = English()
 97 |             self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
 98 |             self.fix_text = ftfy.fix_text
 99 |         except ImportError:
100 |             logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
101 |             self.nlp = BasicTokenizer(do_lower_case=True)
102 |             self.fix_text = None
103 | 
104 |         with open(vocab_file, encoding="utf-8") as vocab_handle:
105 |             self.encoder = json.load(vocab_handle)
106 |         self.decoder = {v:k for k,v in self.encoder.items()}
107 |         with open(merges_file, encoding='utf-8') as merges_handle:
108 |             merges = merges_handle.read().split('\n')[1:-1]
109 |         merges = [tuple(merge.split()) for merge in merges]
110 |         self.bpe_ranks = dict(zip(merges, range(len(merges))))
111 |         self.cache = {}
112 | 
113 |     @property
114 |     def vocab_size(self):
115 |         return len(self.encoder)
116 | 
117 |     def bpe(self, token):
118 |         word = tuple(token[:-1]) + (token[-1] + '</w>',)
119 |         if token in self.cache:
120 |             return self.cache[token]
121 |         pairs = get_pairs(word)
122 | 
123 |         if not pairs:
124 |             return token+'</w>'
125 | 
126 |         while True:
127 |             bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
128 |             if bigram not in self.bpe_ranks:
129 |                 break
130 |             first, second = bigram
131 |             new_word = []
132 |             i = 0
133 |             while i < len(word):
134 |                 try:
135 |                     j = word.index(first, i)
136 |                     new_word.extend(word[i:j])
137 |                     i = j
138 |                 except:
139 |                     new_word.extend(word[i:])
140 |                     break
141 | 
142 |                 if word[i] == first and i < len(word)-1 and word[i+1] == second:
143 |                     new_word.append(first+second)
144 |                     i += 2
145 |                 else:
146 |                     new_word.append(word[i])
147 |                     i += 1
148 |             new_word = tuple(new_word)
149 |             word = new_word
150 |             if len(word) == 1:
151 |                 break
152 |             else:
153 |                 pairs = get_pairs(word)
154 |         word = ' '.join(word)
155 |         if word == '\n  </w>':
156 |             word = '\n</w>'
157 |         self.cache[token] = word
158 |         return word
159 | 
160 |     def _tokenize(self, text):
161 |         """ Tokenize a string. """
162 |         split_tokens = []
163 |         if self.fix_text is None:
164 |             # Using BERT's BasicTokenizer
165 |             text = self.nlp.tokenize(text)
166 |             for token in text:
167 |                 split_tokens.extend([t for t in self.bpe(token).split(' ')])
168 |         else:
169 |             # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
170 |             text = self.nlp(text_standardize(self.fix_text(text)))
171 |             for token in text:
172 |                 split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
173 |         return split_tokens
174 | 
175 |     def _convert_token_to_id(self, token):
176 |         """ Converts a token (str/unicode) in an id using the vocab. """
177 |         return self.encoder.get(token, self.encoder.get(self.unk_token))
178 | 
179 |     def _convert_id_to_token(self, index):
180 |         """Converts an id in a token (BPE) using the vocab."""
181 |         return self.decoder.get(index, self.unk_token)
182 | 
183 |     def convert_tokens_to_string(self, tokens):
184 |         """ Converts a sequence of tokens (string) in a single string. """
185 |         out_string = ''.join(tokens).replace('</w>', ' ').strip()
186 |         return out_string
187 | 
188 |     def save_vocabulary(self, save_directory):
189 |         """Save the tokenizer vocabulary and merge files to a directory."""
190 |         if not os.path.isdir(save_directory):
191 |             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
192 |             return
193 |         vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
194 |         merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
195 | 
196 |         with open(vocab_file, 'w', encoding='utf-8') as f:
197 |             f.write(json.dumps(self.encoder, ensure_ascii=False))
198 | 
199 |         index = 0
200 |         with open(merge_file, "w", encoding="utf-8") as writer:
201 |             writer.write(u'#version: 0.2\n')
202 |             for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
203 |                 if index != token_index:
204 |                     logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
205 |                                    " Please check that the tokenizer is not corrupted!".format(merge_file))
206 |                     index = token_index
207 |                 writer.write(' '.join(bpe_tokens) + u'\n')
208 |                 index += 1
209 | 
210 |         return vocab_file, merge_file
211 | 


--------------------------------------------------------------------------------
/transformers/configuration_xlm.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """ XLM configuration """
 16 | from __future__ import absolute_import, division, print_function, unicode_literals
 17 | 
 18 | import json
 19 | import logging
 20 | import sys
 21 | from io import open
 22 | 
 23 | from .configuration_utils import PretrainedConfig
 24 | 
 25 | logger = logging.getLogger(__name__)
 26 | 
 27 | XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 28 |     'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json",
 29 |     'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.json",
 30 |     'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-config.json",
 31 |     'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.json",
 32 |     'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.json",
 33 |     'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json",
 34 |     'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json",
 35 |     'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json",
 36 |     'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json",
 37 |     'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-config.json",
 38 | }
 39 | 
 40 | 
 41 | class XLMConfig(PretrainedConfig):
 42 |     """Configuration class to store the configuration of a `XLMModel`.
 43 | 
 44 |     Args:
 45 |         vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`.
 46 |         d_model: Size of the encoder layers and the pooler layer.
 47 |         n_layer: Number of hidden layers in the Transformer encoder.
 48 |         n_head: Number of attention heads for each attention layer in
 49 |             the Transformer encoder.
 50 |         d_inner: The size of the "intermediate" (i.e., feed-forward)
 51 |             layer in the Transformer encoder.
 52 |         ff_activation: The non-linear activation function (function or string) in the
 53 |             encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
 54 |         untie_r: untie relative position biases
 55 |         attn_type: 'bi' for XLM, 'uni' for Transformer-XL
 56 | 
 57 |         dropout: The dropout probabilitiy for all fully connected
 58 |             layers in the embeddings, encoder, and pooler.
 59 |         max_position_embeddings: The maximum sequence length that this model might
 60 |             ever be used with. Typically set this to something large just in case
 61 |             (e.g., 512 or 1024 or 2048).
 62 |         initializer_range: The sttdev of the truncated_normal_initializer for
 63 |             initializing all weight matrices.
 64 |         layer_norm_eps: The epsilon used by LayerNorm.
 65 | 
 66 |         dropout: float, dropout rate.
 67 |         init: str, the initialization scheme, either "normal" or "uniform".
 68 |         init_range: float, initialize the parameters with a uniform distribution
 69 |             in [-init_range, init_range]. Only effective when init="uniform".
 70 |         init_std: float, initialize the parameters with a normal distribution
 71 |             with mean 0 and stddev init_std. Only effective when init="normal".
 72 |         mem_len: int, the number of tokens to cache.
 73 |         reuse_len: int, the number of tokens in the currect batch to be cached
 74 |             and reused in the future.
 75 |         bi_data: bool, whether to use bidirectional input pipeline.
 76 |             Usually set to True during pretraining and False during finetuning.
 77 |         clamp_len: int, clamp all relative distances larger than clamp_len.
 78 |             -1 means no clamping.
 79 |         same_length: bool, whether to use the same attention length for each token.
 80 |     """
 81 |     pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
 82 | 
 83 |     def __init__(self,
 84 |                  vocab_size_or_config_json_file=30145,
 85 |                  emb_dim=2048,
 86 |                  n_layers=12,
 87 |                  n_heads=16,
 88 |                  dropout=0.1,
 89 |                  attention_dropout=0.1,
 90 |                  gelu_activation=True,
 91 |                  sinusoidal_embeddings=False,
 92 |                  causal=False,
 93 |                  asm=False,
 94 |                  n_langs=1,
 95 |                  use_lang_emb=True,
 96 |                  max_position_embeddings=512,
 97 |                  embed_init_std=2048 ** -0.5,
 98 |                  layer_norm_eps=1e-12,
 99 |                  init_std=0.02,
100 |                  bos_index=0,
101 |                  eos_index=1,
102 |                  pad_index=2,
103 |                  unk_index=3,
104 |                  mask_index=5,
105 |                  is_encoder=True,
106 | 
107 |                  finetuning_task=None,
108 |                  num_labels=2,
109 |                  summary_type='first',
110 |                  summary_use_proj=True,
111 |                  summary_activation=None,
112 |                  summary_proj_to_labels=True,
113 |                  summary_first_dropout=0.1,
114 |                  start_n_top=5,
115 |                  end_n_top=5,
116 |                  **kwargs):
117 |         """Constructs XLMConfig.
118 |         """
119 |         super(XLMConfig, self).__init__(**kwargs)
120 | 
121 |         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
122 |                         and isinstance(vocab_size_or_config_json_file, unicode)):
123 |             with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
124 |                 json_config = json.loads(reader.read())
125 |             for key, value in json_config.items():
126 |                 self.__dict__[key] = value
127 |         elif isinstance(vocab_size_or_config_json_file, int):
128 |             self.n_words = vocab_size_or_config_json_file
129 |             self.emb_dim = emb_dim
130 |             self.n_layers = n_layers
131 |             self.n_heads = n_heads
132 |             self.dropout = dropout
133 |             self.attention_dropout = attention_dropout
134 |             self.gelu_activation = gelu_activation
135 |             self.sinusoidal_embeddings = sinusoidal_embeddings
136 |             self.causal = causal
137 |             self.asm = asm
138 |             self.n_langs = n_langs
139 |             self.use_lang_emb = use_lang_emb
140 |             self.layer_norm_eps = layer_norm_eps
141 |             self.bos_index = bos_index
142 |             self.eos_index = eos_index
143 |             self.pad_index = pad_index
144 |             self.unk_index = unk_index
145 |             self.mask_index = mask_index
146 |             self.is_encoder = is_encoder
147 |             self.max_position_embeddings = max_position_embeddings
148 |             self.embed_init_std = embed_init_std
149 |             self.init_std = init_std
150 |             self.finetuning_task = finetuning_task
151 |             self.num_labels = num_labels
152 |             self.summary_type = summary_type
153 |             self.summary_use_proj = summary_use_proj
154 |             self.summary_activation = summary_activation
155 |             self.summary_proj_to_labels = summary_proj_to_labels
156 |             self.summary_first_dropout = summary_first_dropout
157 |             self.start_n_top = start_n_top
158 |             self.end_n_top = end_n_top
159 |         else:
160 |             raise ValueError("First argument must be either a vocabulary size (int)"
161 |                              " or the path to a pretrained model config file (str)")
162 | 
163 |     @property
164 |     def vocab_size(self):
165 |         return self.n_words
166 | 
167 |     @vocab_size.setter
168 |     def vocab_size(self, value):
169 |         self.n_words = value
170 | 
171 |     @property
172 |     def hidden_size(self):
173 |         return self.emb_dim
174 | 
175 |     @property
176 |     def num_attention_heads(self):
177 |         return self.n_heads
178 | 
179 |     @property
180 |     def num_hidden_layers(self):
181 |         return self.n_layers
182 | 


--------------------------------------------------------------------------------
/transformers/modeling_tf_transfo_xl_utilities.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
  3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | """ A TF 2.0 Adaptive Softmax for Transformer XL model.
 17 | """
 18 | 
 19 | from collections import defaultdict
 20 | 
 21 | import numpy as np
 22 | 
 23 | import tensorflow as tf
 24 | 
 25 | from .modeling_tf_utils import shape_list
 26 | 
 27 | class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
 28 |     def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
 29 |                  keep_order=False, **kwargs):
 30 |         super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs)
 31 | 
 32 |         self.n_token = n_token
 33 |         self.d_embed = d_embed
 34 |         self.d_proj = d_proj
 35 | 
 36 |         self.cutoffs = cutoffs + [n_token]
 37 |         self.cutoff_ends = [0] + self.cutoffs
 38 |         self.div_val = div_val
 39 | 
 40 |         self.shortlist_size = self.cutoffs[0]
 41 |         self.n_clusters = len(self.cutoffs) - 1
 42 |         self.head_size = self.shortlist_size + self.n_clusters
 43 |         self.keep_order = keep_order
 44 | 
 45 |         self.out_layers = []
 46 |         self.out_projs = []
 47 | 
 48 |     def build(self, input_shape):
 49 |         if self.n_clusters > 0:
 50 |             self.cluster_weight = self.add_weight(shape=(self.n_clusters, self.d_embed),
 51 |                                                   initializer='zeros',
 52 |                                                   trainable=True,
 53 |                                                   name='cluster_weight')
 54 |             self.cluster_bias = self.add_weight(shape=(self.n_clusters,),
 55 |                                                 initializer='zeros',
 56 |                                                 trainable=True,
 57 |                                                 name='cluster_bias')
 58 | 
 59 |         if self.div_val == 1:
 60 |             for i in range(len(self.cutoffs)):
 61 |                 if self.d_proj != self.d_embed:
 62 |                     weight = self.add_weight(shape=(self.d_embed, self.d_proj),
 63 |                                              initializer='zeros',
 64 |                                              trainable=True,
 65 |                                              name='out_projs_._{}'.format(i))
 66 |                     self.out_projs.append(weight)
 67 |                 else:
 68 |                     self.out_projs.append(None)
 69 |                 weight = self.add_weight(shape=(self.n_token, self.d_embed,),
 70 |                                          initializer='zeros',
 71 |                                          trainable=True,
 72 |                                          name='out_layers_._{}_._weight'.format(i))
 73 |                 bias = self.add_weight(shape=(self.n_token,),
 74 |                                          initializer='zeros',
 75 |                                          trainable=True,
 76 |                                          name='out_layers_._{}_._bias'.format(i))
 77 |                 self.out_layers.append((weight, bias))
 78 |         else:
 79 |             for i in range(len(self.cutoffs)):
 80 |                 l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1]
 81 |                 d_emb_i = self.d_embed // (self.div_val ** i)
 82 | 
 83 |                 weight = self.add_weight(shape=(d_emb_i, self.d_proj),
 84 |                                          initializer='zeros',
 85 |                                          trainable=True,
 86 |                                          name='out_projs_._{}'.format(i))
 87 |                 self.out_projs.append(weight)
 88 |                 weight = self.add_weight(shape=(r_idx-l_idx, d_emb_i,),
 89 |                                          initializer='zeros',
 90 |                                          trainable=True,
 91 |                                          name='out_layers_._{}_._weight'.format(i))
 92 |                 bias = self.add_weight(shape=(r_idx-l_idx,),
 93 |                                          initializer='zeros',
 94 |                                          trainable=True,
 95 |                                          name='out_layers_._{}_._bias'.format(i))
 96 |                 self.out_layers.append((weight, bias))
 97 |         super(TFAdaptiveSoftmaxMask, self).build(input_shape)
 98 | 
 99 |     @staticmethod
100 |     def _logit(x, W, b, proj=None):
101 |         y = x
102 |         if proj is not None:
103 |             y = tf.einsum('ibd,ed->ibe', y, proj)
104 |         return tf.einsum('ibd,nd->ibn', y, W) + b
105 | 
106 |     @staticmethod
107 |     def _gather_logprob(logprob, target):
108 |         lp_size = shape_list(logprob)
109 |         r = tf.range(lp_size[0])
110 |         idx = tf.stack([r, target], 1)
111 |         return tf.gather_nd(logprob, idx)
112 | 
113 |     def call(self, inputs, return_mean=True, training=False):
114 |         hidden, target = inputs
115 |         head_logprob = 0
116 |         if self.n_clusters == 0:
117 |             softmax_b = tf.get_variable('bias', [n_token], initializer=tf.zeros_initializer())
118 |             output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0])
119 |             if target is not None:
120 |                 loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output)
121 |             out = tf.nn.log_softmax(output, axis=-1)
122 |         else:
123 |             hidden_sizes = shape_list(hidden)
124 |             out = []
125 |             loss = tf.zeros(hidden_sizes[:2], dtype=tf.float32)
126 |             for i in range(len(self.cutoffs)):
127 |                 l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
128 |                 if target is not None:
129 |                     mask = (target >= l_idx) & (target < r_idx)
130 |                     mask_idx = tf.where(mask)
131 |                     cur_target = tf.boolean_mask(target, mask) - l_idx
132 | 
133 |                 if self.div_val == 1:
134 |                     cur_W = self.out_layers[0][0][l_idx:r_idx]
135 |                     cur_b = self.out_layers[0][1][l_idx:r_idx]
136 |                 else:
137 |                     cur_W = self.out_layers[i][0]
138 |                     cur_b = self.out_layers[i][1]
139 | 
140 |                 if i == 0:
141 |                     cur_W = tf.concat([cur_W, self.cluster_weight], 0)
142 |                     cur_b = tf.concat([cur_b, self.cluster_bias], 0)
143 | 
144 |                     head_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[0])
145 |                     head_logprob = tf.nn.log_softmax(head_logit)
146 |                     out.append(head_logprob[..., :self.cutoffs[0]])
147 |                     if target is not None:
148 |                         cur_head_logprob = tf.boolean_mask(head_logprob, mask)
149 |                         cur_logprob = self._gather_logprob(cur_head_logprob, cur_target)
150 |                 else:
151 |                     tail_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[i])
152 |                     tail_logprob = tf.nn.log_softmax(tail_logit)
153 |                     cluster_prob_idx = self.cutoffs[0] + i - 1  # No probability for the head cluster
154 |                     logprob_i = head_logprob[..., cluster_prob_idx, None] + tail_logprob
155 |                     out.append(logprob_i)
156 |                     if target is not None:
157 |                         cur_head_logprob = tf.boolean_mask(head_logprob, mask)
158 |                         cur_tail_logprob = tf.boolean_mask(tail_logprob, mask)
159 |                         cur_logprob = self._gather_logprob(cur_tail_logprob, cur_target)
160 |                         cur_logprob += cur_head_logprob[:, self.cutoff_ends[1] + i - 1]
161 |                 if target is not None:
162 |                     loss += tf.scatter_nd(mask_idx, -cur_logprob, tf.cast(shape_list(loss), dtype=tf.int64))
163 |             out = tf.concat(out, axis=-1)
164 | 
165 |         if target is not None:
166 |             if return_mean:
167 |                 loss = tf.reduce_mean(loss)
168 |             # Add the training-time loss value to the layer using `self.add_loss()`.
169 |             self.add_loss(loss)
170 | 
171 |             # Log the loss as a metric (we could log arbitrary metrics,
172 |             # including different metrics for training and inference.
173 |             self.add_metric(loss, name=self.name, aggregation='mean' if return_mean else '')
174 | 
175 |         return out
176 | 


--------------------------------------------------------------------------------
/transformers/tokenization_ctrl.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 Salesforce and The HuggingFace Inc. team.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """Tokenization classes for Salesforce CTRL."""
 16 | from __future__ import (absolute_import, division, print_function,
 17 |                         unicode_literals)
 18 | 
 19 | import json
 20 | import logging
 21 | import os
 22 | import regex as re
 23 | from io import open
 24 | 
 25 | from .tokenization_utils import PreTrainedTokenizer
 26 | 
 27 | logger = logging.getLogger(__name__)
 28 | 
 29 | VOCAB_FILES_NAMES = {
 30 |     'vocab_file': 'vocab.json',
 31 |     'merges_file': 'merges.txt',
 32 | }
 33 | 
 34 | PRETRAINED_VOCAB_FILES_MAP = {
 35 |     'vocab_file':
 36 |     {
 37 |         'ctrl': "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.json",
 38 |     },
 39 |     'merges_file':
 40 |     {
 41 |         'ctrl': "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txt",
 42 |     },
 43 | }
 44 | 
 45 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 46 |     'ctrl': 256,
 47 | }
 48 | 
 49 | CONTROL_CODES = {
 50 |     "Pregnancy": 168629,
 51 |     "Christianity": 7675,
 52 |     "Explain": 106423,
 53 |     "Fitness": 63440,
 54 |     "Saving": 63163,
 55 |     "Ask": 27171,
 56 |     "Ass": 95985,
 57 |     "Joke": 163509,
 58 |     "Questions": 45622,
 59 |     "Thoughts": 49605,
 60 |     "Retail": 52342,
 61 |     "Feminism": 164338,
 62 |     "Writing": 11992,
 63 |     "Atheism": 192263,
 64 |     "Netflix": 48616,
 65 |     "Computing": 39639,
 66 |     "Opinion": 43213,
 67 |     "Alone": 44967,
 68 |     "Funny": 58917,
 69 |     "Gaming": 40358,
 70 |     "Human": 4088,
 71 |     "India": 1331,
 72 |     "Joker": 77138,
 73 |     "Diet": 36206,
 74 |     "Legal": 11859,
 75 |     "Norman": 4939,
 76 |     "Tip": 72689,
 77 |     "Weight": 52343,
 78 |     "Movies": 46273,
 79 |     "Running": 23425,
 80 |     "Science": 2090,
 81 |     "Horror": 37793,
 82 |     "Confession": 60572,
 83 |     "Finance": 12250,
 84 |     "Politics": 16360,
 85 |     "Scary": 191985,
 86 |     "Support": 12654,
 87 |     "Technologies": 32516,
 88 |     "Teenage": 66160,
 89 |     "Event": 32769,
 90 |     "Learned": 67460,
 91 |     "Notion": 182770,
 92 |     "Wikipedia": 37583,
 93 |     "Books": 6665,
 94 |     "Extract": 76050,
 95 |     "Confessions": 102701,
 96 |     "Conspiracy": 75932,
 97 |     "Links": 63674,
 98 |     "Narcissus": 150425,
 99 |     "Relationship": 54766,
100 |     "Relationships": 134796,
101 |     "Reviews": 41671,
102 |     "News": 4256,
103 |     "Translation": 26820,
104 |     "multilingual": 128406,
105 | }
106 | 
107 | def get_pairs(word):
108 |     """Return set of symbol pairs in a word.
109 | 
110 |     Word is represented as tuple of symbols (symbols being variable-length strings).
111 |     """
112 |     pairs = set()
113 |     prev_char = word[0]
114 |     for char in word[1:]:
115 |         pairs.add((prev_char, char))
116 |         prev_char = char
117 | 
118 |     pairs = set(pairs)
119 |     return pairs
120 | 
121 | class CTRLTokenizer(PreTrainedTokenizer):
122 |     """
123 |     CTRL BPE tokenizer. Peculiarities:
124 |         - Byte-Pair-Encoding
125 |     """
126 |     vocab_files_names = VOCAB_FILES_NAMES
127 |     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
128 |     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
129 |     control_codes = CONTROL_CODES
130 | 
131 |     def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
132 |         super(CTRLTokenizer, self).__init__(unk_token=unk_token, **kwargs)
133 |         self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
134 |         self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
135 | 
136 |         with open(vocab_file, encoding="utf-8") as vocab_handle:
137 |             self.encoder = json.load(vocab_handle)
138 |         self.decoder = {v:k for k,v in self.encoder.items()}
139 |         with open(merges_file, encoding='utf-8') as merges_handle:
140 |             merges = merges_handle.read().split('\n')[1:-1]
141 |         merges = [tuple(merge.split()) for merge in merges]
142 |         self.bpe_ranks = dict(zip(merges, range(len(merges))))
143 |         self.cache = {}
144 | 
145 |     @property
146 |     def vocab_size(self):
147 |         return len(self.encoder)
148 | 
149 |     def bpe(self, token):
150 |         if token in self.cache:
151 |             return self.cache[token]
152 |         word = tuple(token)
153 |         word = tuple(list(word[:-1]) + [word[-1]+'</w>'])
154 |         pairs = get_pairs(word)
155 | 
156 |         if not pairs:
157 |             return token
158 | 
159 |         while True:
160 |             bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
161 |             if bigram not in self.bpe_ranks:
162 |                 break
163 |             first, second = bigram
164 |             new_word = []
165 |             i = 0
166 |             while i < len(word):
167 |                 try:
168 |                     j = word.index(first, i)
169 |                     new_word.extend(word[i:j])
170 |                     i = j
171 |                 except:
172 |                     new_word.extend(word[i:])
173 |                     break
174 | 
175 |                 if word[i] == first and i < len(word)-1 and word[i+1] == second:
176 |                     new_word.append(first+second)
177 |                     i += 2
178 |                 else:
179 |                     new_word.append(word[i])
180 |                     i += 1
181 |             new_word = tuple(new_word)
182 |             word = new_word
183 |             if len(word) == 1:
184 |                 break
185 |             else:
186 |                 pairs = get_pairs(word)
187 |         word = '@@ '.join(word)
188 |         word = word[:-4]
189 |         self.cache[token] = word
190 |         return word
191 | 
192 |     def _tokenize(self, text):
193 |         """ Tokenize a string.
194 |         """
195 |         split_tokens = []
196 | 
197 |         words = re.findall(r'\S+\n?', text)
198 | 
199 |         for token in words:
200 |             split_tokens.extend([t for t in self.bpe(token).split(' ')])
201 |         return split_tokens
202 | 
203 |     def _convert_token_to_id(self, token):
204 |         """ Converts a token (str/unicode) in an id using the vocab. """
205 |         return self.encoder.get(token, self.encoder.get(self.unk_token))
206 | 
207 |     def _convert_id_to_token(self, index):
208 |         """Converts an index (integer) in a token (string/unicode) using the vocab."""
209 |         return self.decoder.get(index, self.unk_token)
210 | 
211 |     def convert_tokens_to_string(self, tokens):
212 |         """ Converts a sequence of tokens (string) in a single string. """
213 |         out_string = ' '.join(tokens).replace('@@ ', '').strip()
214 |         return out_string
215 | 
216 |     def save_vocabulary(self, save_directory):
217 |         """Save the tokenizer vocabulary and merge files to a directory."""
218 |         if not os.path.isdir(save_directory):
219 |             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
220 |             return
221 |         vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
222 |         merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
223 | 
224 |         with open(vocab_file, 'w', encoding='utf-8') as f:
225 |             f.write(json.dumps(self.encoder, ensure_ascii=False))
226 | 
227 |         index = 0
228 |         with open(merge_file, "w", encoding="utf-8") as writer:
229 |             writer.write(u'#version: 0.2\n')
230 |             for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
231 |                 if index != token_index:
232 |                     logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
233 |                                    " Please check that the tokenizer is not corrupted!".format(merge_file))
234 |                     index = token_index
235 |                 writer.write(' '.join(bpe_tokens) + u'\n')
236 |                 index += 1
237 | 
238 |         return vocab_file, merge_file
239 | 
240 |     # def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
241 |     #     filtered_tokens = ' '.join(self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens))
242 |     #     tokens_generated_so_far = re.sub('(@@ )', '', string=filtered_tokens)
243 |     #     tokens_generated_so_far = re.sub('(@@ ?$)', '', string=tokens_generated_so_far)
244 |     #     return ''.join(tokens_generated_so_far)
245 | 


--------------------------------------------------------------------------------
/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The HuggingFace Inc. team.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """Convert RoBERTa checkpoint."""
 16 | 
 17 | from __future__ import absolute_import, division, print_function
 18 | 
 19 | import argparse
 20 | import logging
 21 | import numpy as np
 22 | import torch
 23 | 
 24 | from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
 25 | from fairseq.modules import TransformerSentenceEncoderLayer
 26 | from transformers.modeling_bert import (BertConfig, BertEncoder,
 27 |                                         BertIntermediate, BertLayer,
 28 |                                         BertModel, BertOutput,
 29 |                                         BertSelfAttention,
 30 |                                         BertSelfOutput)
 31 | from transformers.modeling_roberta import (RobertaEmbeddings,
 32 |                                            RobertaForMaskedLM,
 33 |                                            RobertaForSequenceClassification,
 34 |                                            RobertaModel)
 35 | 
 36 | logging.basicConfig(level=logging.INFO)
 37 | logger = logging.getLogger(__name__)
 38 | 
 39 | SAMPLE_TEXT = 'Hello world! cécé herlolip'
 40 | 
 41 | 
 42 | def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head):
 43 |     """
 44 |     Copy/paste/tweak roberta's weights to our BERT structure.
 45 |     """
 46 |     roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
 47 |     roberta.eval()  # disable dropout
 48 |     config = BertConfig(
 49 |         vocab_size_or_config_json_file=50265,
 50 |         hidden_size=roberta.args.encoder_embed_dim,
 51 |         num_hidden_layers=roberta.args.encoder_layers,
 52 |         num_attention_heads=roberta.args.encoder_attention_heads,
 53 |         intermediate_size=roberta.args.encoder_ffn_embed_dim,
 54 |         max_position_embeddings=514,
 55 |         type_vocab_size=1,
 56 |         layer_norm_eps=1e-5, # PyTorch default used in fairseq
 57 |     )
 58 |     if classification_head:
 59 |         config.num_labels = roberta.args.num_classes
 60 |     print("Our BERT config:", config)
 61 | 
 62 |     model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config)
 63 |     model.eval()
 64 | 
 65 |     # Now let's copy all the weights.
 66 |     # Embeddings
 67 |     roberta_sent_encoder = roberta.model.decoder.sentence_encoder
 68 |     model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
 69 |     model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
 70 |     model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight)  # just zero them out b/c RoBERTa doesn't use them.
 71 |     model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
 72 |     model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias
 73 | 
 74 |     for i in range(config.num_hidden_layers):
 75 |         # Encoder: start of layer
 76 |         layer: BertLayer = model.roberta.encoder.layer[i]
 77 |         roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i]
 78 | 
 79 |         ### self attention
 80 |         self_attn: BertSelfAttention = layer.attention.self
 81 |         assert(
 82 |             roberta_layer.self_attn.in_proj_weight.shape == torch.Size((3 * config.hidden_size, config.hidden_size))
 83 |         )
 84 |         # we use three distinct linear layers so we split the source layer here.
 85 |         self_attn.query.weight.data = roberta_layer.self_attn.in_proj_weight[:config.hidden_size, :]
 86 |         self_attn.query.bias.data = roberta_layer.self_attn.in_proj_bias[:config.hidden_size]
 87 |         self_attn.key.weight.data = roberta_layer.self_attn.in_proj_weight[config.hidden_size:2*config.hidden_size, :]
 88 |         self_attn.key.bias.data = roberta_layer.self_attn.in_proj_bias[config.hidden_size:2*config.hidden_size]
 89 |         self_attn.value.weight.data = roberta_layer.self_attn.in_proj_weight[2*config.hidden_size:, :]
 90 |         self_attn.value.bias.data = roberta_layer.self_attn.in_proj_bias[2*config.hidden_size:]
 91 | 
 92 |         ### self-attention output
 93 |         self_output: BertSelfOutput = layer.attention.output
 94 |         assert(
 95 |             self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape
 96 |         )
 97 |         self_output.dense.weight = roberta_layer.self_attn.out_proj.weight
 98 |         self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
 99 |         self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
100 |         self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias
101 | 
102 |         ### intermediate
103 |         intermediate: BertIntermediate = layer.intermediate
104 |         assert(
105 |             intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape
106 |         )
107 |         intermediate.dense.weight = roberta_layer.fc1.weight
108 |         intermediate.dense.bias = roberta_layer.fc1.bias
109 | 
110 |         ### output
111 |         bert_output: BertOutput = layer.output
112 |         assert(
113 |             bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape
114 |         )
115 |         bert_output.dense.weight = roberta_layer.fc2.weight
116 |         bert_output.dense.bias = roberta_layer.fc2.bias
117 |         bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
118 |         bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
119 |         #### end of layer
120 |     
121 |     if classification_head:
122 |         model.classifier.dense.weight = roberta.model.classification_heads['mnli'].dense.weight
123 |         model.classifier.dense.bias = roberta.model.classification_heads['mnli'].dense.bias
124 |         model.classifier.out_proj.weight = roberta.model.classification_heads['mnli'].out_proj.weight
125 |         model.classifier.out_proj.bias = roberta.model.classification_heads['mnli'].out_proj.bias
126 |     else:
127 |         # LM Head
128 |         model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight
129 |         model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias
130 |         model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight
131 |         model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias
132 |         model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight
133 |         model.lm_head.bias = roberta.model.decoder.lm_head.bias
134 | 
135 |     # Let's check that we get the same results.
136 |     input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1
137 | 
138 |     our_output = model(input_ids)[0]
139 |     if classification_head:
140 |         their_output = roberta.model.classification_heads['mnli'](roberta.extract_features(input_ids))
141 |     else:
142 |         their_output = roberta.model(input_ids)[0]
143 |     print(our_output.shape, their_output.shape)
144 |     max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
145 |     print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7
146 |     success = torch.allclose(our_output, their_output, atol=1e-3)
147 |     print(
148 |         "Do both models output the same tensors?",
149 |         "🔥" if success else "💩"
150 |     )
151 |     if not success:
152 |         raise Exception("Something went wRoNg")
153 | 
154 |     print(f"Saving model to {pytorch_dump_folder_path}")
155 |     model.save_pretrained(pytorch_dump_folder_path)
156 | 
157 | 
158 | if __name__ == "__main__":
159 |     parser = argparse.ArgumentParser()
160 |     ## Required parameters
161 |     parser.add_argument("--roberta_checkpoint_path",
162 |                         default = None,
163 |                         type = str,
164 |                         required = True,
165 |                         help = "Path the official PyTorch dump.")
166 |     parser.add_argument("--pytorch_dump_folder_path",
167 |                         default = None,
168 |                         type = str,
169 |                         required = True,
170 |                         help = "Path to the output PyTorch model.")
171 |     parser.add_argument("--classification_head",
172 |                         action = "store_true",
173 |                         help = "Whether to convert a final classification head.")
174 |     args = parser.parse_args()
175 |     convert_roberta_checkpoint_to_pytorch(
176 |         args.roberta_checkpoint_path,
177 |         args.pytorch_dump_folder_path,
178 |         args.classification_head
179 |     )
180 | 
181 | 


--------------------------------------------------------------------------------
/transformers/tokenization_auto.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The HuggingFace Inc. team.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """ Auto Model class. """
 16 | 
 17 | from __future__ import absolute_import, division, print_function, unicode_literals
 18 | 
 19 | import logging
 20 | 
 21 | from .tokenization_bert import BertTokenizer
 22 | from .tokenization_bert_japanese import BertJapaneseTokenizer
 23 | from .tokenization_openai import OpenAIGPTTokenizer
 24 | from .tokenization_gpt2 import GPT2Tokenizer
 25 | from .tokenization_ctrl import CTRLTokenizer
 26 | from .tokenization_transfo_xl import TransfoXLTokenizer
 27 | from .tokenization_xlnet import XLNetTokenizer
 28 | from .tokenization_xlm import XLMTokenizer
 29 | from .tokenization_roberta import RobertaTokenizer
 30 | from .tokenization_distilbert import DistilBertTokenizer
 31 | from .tokenization_camembert import CamembertTokenizer
 32 | from .tokenization_albert import AlbertTokenizer
 33 | 
 34 | logger = logging.getLogger(__name__)
 35 | 
 36 | class AutoTokenizer(object):
 37 |     r""":class:`~transformers.AutoTokenizer` is a generic tokenizer class
 38 |         that will be instantiated as one of the tokenizer classes of the library
 39 |         when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)`
 40 |         class method.
 41 | 
 42 |         The `from_pretrained()` method take care of returning the correct tokenizer class instance
 43 |         using pattern matching on the `pretrained_model_name_or_path` string.
 44 | 
 45 |         The tokenizer class to instantiate is selected as the first pattern matching
 46 |         in the `pretrained_model_name_or_path` string (in the following order):
 47 |             - contains `distilbert`: DistilBertTokenizer (DistilBert model)
 48 |             - contains `albert`: AlbertTokenizer (ALBERT model)
 49 |             - contains `camembert`: CamembertTokenizer (CamemBERT model)
 50 |             - contains `roberta`: RobertaTokenizer (RoBERTa model)
 51 |             - contains `bert`: BertTokenizer (Bert model)
 52 |             - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
 53 |             - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
 54 |             - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
 55 |             - contains `xlnet`: XLNetTokenizer (XLNet model)
 56 |             - contains `xlm`: XLMTokenizer (XLM model)
 57 |             - contains `ctrl`: CTRLTokenizer (Salesforce CTRL model)
 58 | 
 59 |         This class cannot be instantiated using `__init__()` (throw an error).
 60 |     """
 61 |     def __init__(self):
 62 |         raise EnvironmentError("AutoTokenizer is designed to be instantiated "
 63 |             "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method.")
 64 | 
 65 |     @classmethod
 66 |     def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
 67 |         r""" Instantiate a one of the tokenizer classes of the library
 68 |         from a pre-trained model vocabulary.
 69 | 
 70 |         The tokenizer class to instantiate is selected as the first pattern matching
 71 |         in the `pretrained_model_name_or_path` string (in the following order):
 72 |             - contains `distilbert`: DistilBertTokenizer (DistilBert model)
 73 |             - contains `albert`: AlbertTokenizer (ALBERT model)
 74 |             - contains `camembert`: CamembertTokenizer (CamemBERT model)
 75 |             - contains `roberta`: RobertaTokenizer (RoBERTa model)
 76 |             - contains `bert-base-japanese`: BertJapaneseTokenizer (Bert model)
 77 |             - contains `bert`: BertTokenizer (Bert model)
 78 |             - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
 79 |             - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
 80 |             - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
 81 |             - contains `xlnet`: XLNetTokenizer (XLNet model)
 82 |             - contains `xlm`: XLMTokenizer (XLM model)
 83 |             - contains `ctrl`: CTRLTokenizer (Salesforce CTRL model)
 84 | 
 85 |         Params:
 86 |             pretrained_model_name_or_path: either:
 87 | 
 88 |                 - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
 89 |                 - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
 90 |                 - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
 91 |                 - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
 92 | 
 93 |             cache_dir: (`optional`) string:
 94 |                 Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
 95 | 
 96 |             force_download: (`optional`) boolean, default False:
 97 |                 Force to (re-)download the vocabulary files and override the cached versions if they exists.
 98 | 
 99 |             resume_download: (`optional`) boolean, default False:
100 |                 Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
101 | 
102 |             proxies: (`optional`) dict, default None:
103 |                 A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
104 |                 The proxies are used on each request.
105 | 
106 |             inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
107 | 
108 |             kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details.
109 | 
110 |         Examples::
111 | 
112 |             # Download vocabulary from S3 and cache.
113 |             tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
114 | 
115 |             # Download vocabulary from S3 (user-uploaded) and cache.
116 |             tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
117 | 
118 |             # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
119 |             tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')
120 | 
121 |         """
122 |         if 'distilbert' in pretrained_model_name_or_path:
123 |             return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
124 |         elif 'albert' in pretrained_model_name_or_path:
125 |             return AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
126 |         elif 'camembert' in pretrained_model_name_or_path:
127 |             return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
128 |         elif 'roberta' in pretrained_model_name_or_path:
129 |             return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
130 |         elif 'bert-base-japanese' in pretrained_model_name_or_path:
131 |             return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
132 |         elif 'bert' in pretrained_model_name_or_path:
133 |             return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
134 |         elif 'openai-gpt' in pretrained_model_name_or_path:
135 |             return OpenAIGPTTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
136 |         elif 'gpt2' in pretrained_model_name_or_path:
137 |             return GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
138 |         elif 'transfo-xl' in pretrained_model_name_or_path:
139 |             return TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
140 |         elif 'xlnet' in pretrained_model_name_or_path:
141 |             return XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
142 |         elif 'xlm' in pretrained_model_name_or_path:
143 |             return XLMTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
144 |         elif 'ctrl' in pretrained_model_name_or_path:
145 |             return CTRLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
146 |         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
147 |                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
148 |                          "'xlm', 'roberta', 'distilbert,' 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))
149 | 


--------------------------------------------------------------------------------
/transformers/configuration_auto.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The HuggingFace Inc. team.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """ Auto Model class. """
 16 | 
 17 | from __future__ import absolute_import, division, print_function, unicode_literals
 18 | 
 19 | import logging
 20 | 
 21 | from .configuration_bert import BertConfig
 22 | from .configuration_openai import OpenAIGPTConfig
 23 | from .configuration_gpt2 import GPT2Config
 24 | from .configuration_transfo_xl import TransfoXLConfig
 25 | from .configuration_xlnet import XLNetConfig
 26 | from .configuration_xlm import XLMConfig
 27 | from .configuration_roberta import RobertaConfig
 28 | from .configuration_distilbert import DistilBertConfig
 29 | from .configuration_ctrl import CTRLConfig
 30 | from .configuration_camembert import CamembertConfig
 31 | from .configuration_albert import AlbertConfig
 32 | 
 33 | logger = logging.getLogger(__name__)
 34 | 
 35 | 
 36 | class AutoConfig(object):
 37 |     r""":class:`~transformers.AutoConfig` is a generic configuration class
 38 |         that will be instantiated as one of the configuration classes of the library
 39 |         when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)`
 40 |         class method.
 41 | 
 42 |         The `from_pretrained()` method take care of returning the correct model class instance
 43 |         using pattern matching on the `pretrained_model_name_or_path` string.
 44 | 
 45 |         The base model class to instantiate is selected as the first pattern matching
 46 |         in the `pretrained_model_name_or_path` string (in the following order):
 47 |             - contains `distilbert`: DistilBertConfig (DistilBERT model)
 48 |             - contains `albert`: AlbertConfig (ALBERT model)
 49 |             - contains `camembert`: CamembertConfig (CamemBERT model)
 50 |             - contains `roberta`: RobertaConfig (RoBERTa model)
 51 |             - contains `bert`: BertConfig (Bert model)
 52 |             - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
 53 |             - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
 54 |             - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
 55 |             - contains `xlnet`: XLNetConfig (XLNet model)
 56 |             - contains `xlm`: XLMConfig (XLM model)
 57 |             - contains `ctrl` : CTRLConfig (CTRL model)
 58 |         This class cannot be instantiated using `__init__()` (throw an error).
 59 |     """
 60 |     def __init__(self):
 61 |         raise EnvironmentError("AutoConfig is designed to be instantiated "
 62 |             "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.")
 63 | 
 64 |     @classmethod
 65 |     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
 66 |         r""" Instantiate a one of the configuration classes of the library
 67 |         from a pre-trained model configuration.
 68 | 
 69 |         The configuration class to instantiate is selected as the first pattern matching
 70 |         in the `pretrained_model_name_or_path` string (in the following order):
 71 |             - contains `distilbert`: DistilBertConfig (DistilBERT model)
 72 |             - contains `albert`: AlbertConfig (ALBERT model)
 73 |             - contains `camembert`: CamembertConfig (CamemBERT model)
 74 |             - contains `roberta`: RobertaConfig (RoBERTa model)
 75 |             - contains `bert`: BertConfig (Bert model)
 76 |             - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
 77 |             - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
 78 |             - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
 79 |             - contains `xlnet`: XLNetConfig (XLNet model)
 80 |             - contains `xlm`: XLMConfig (XLM model)
 81 |             - contains `ctrl` : CTRLConfig (CTRL model)
 82 |         Params:
 83 |             pretrained_model_name_or_path: either:
 84 | 
 85 |                 - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
 86 |                 - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
 87 |                 - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
 88 |                 - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
 89 | 
 90 |             cache_dir: (`optional`) string:
 91 |                 Path to a directory in which a downloaded pre-trained model
 92 |                 configuration should be cached if the standard cache should not be used.
 93 | 
 94 |             kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading.
 95 | 
 96 |                 - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
 97 |                 - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
 98 | 
 99 |             force_download: (`optional`) boolean, default False:
100 |                 Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
101 | 
102 |             resume_download: (`optional`) boolean, default False:
103 |                 Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
104 | 
105 |             proxies: (`optional`) dict, default None:
106 |                 A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
107 |                 The proxies are used on each request.
108 | 
109 |             return_unused_kwargs: (`optional`) bool:
110 | 
111 |                 - If False, then this function returns just the final configuration object.
112 |                 - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
113 | 
114 |         Examples::
115 | 
116 |             config = AutoConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
117 |             config = AutoConfig.from_pretrained('./test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
118 |             config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
119 |             config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
120 |             assert config.output_attention == True
121 |             config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True,
122 |                                                                foo=False, return_unused_kwargs=True)
123 |             assert config.output_attention == True
124 |             assert unused_kwargs == {'foo': False}
125 | 
126 |         """
127 |         if 'distilbert' in pretrained_model_name_or_path:
128 |             return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
129 |         elif 'albert' in pretrained_model_name_or_path:
130 |             return AlbertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
131 |         elif 'camembert' in pretrained_model_name_or_path:
132 |             return CamembertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
133 |         elif 'roberta' in pretrained_model_name_or_path:
134 |             return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
135 |         elif 'bert' in pretrained_model_name_or_path:
136 |             return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
137 |         elif 'openai-gpt' in pretrained_model_name_or_path:
138 |             return OpenAIGPTConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
139 |         elif 'gpt2' in pretrained_model_name_or_path:
140 |             return GPT2Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
141 |         elif 'transfo-xl' in pretrained_model_name_or_path:
142 |             return TransfoXLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
143 |         elif 'xlnet' in pretrained_model_name_or_path:
144 |             return XLNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
145 |         elif 'xlm' in pretrained_model_name_or_path:
146 |             return XLMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
147 |         elif 'ctrl' in pretrained_model_name_or_path:
148 |             return CTRLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
149 |         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
150 |                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
151 |                          "'xlm', 'roberta', 'distilbert', 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))
152 | 


--------------------------------------------------------------------------------
/transformers/optimization_tf.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2019 The TensorFlow Authors. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | """Functions and classes related to optimization (weight updates)."""
 16 | 
 17 | from __future__ import absolute_import
 18 | from __future__ import division
 19 | from __future__ import print_function
 20 | 
 21 | import re
 22 | 
 23 | import tensorflow as tf
 24 | 
 25 | 
 26 | class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
 27 |   """Applys a warmup schedule on a given learning rate decay schedule."""
 28 | 
 29 |   def __init__(
 30 |       self,
 31 |       initial_learning_rate,
 32 |       decay_schedule_fn,
 33 |       warmup_steps,
 34 |       power=1.0,
 35 |       name=None):
 36 |     super(WarmUp, self).__init__()
 37 |     self.initial_learning_rate = initial_learning_rate
 38 |     self.warmup_steps = warmup_steps
 39 |     self.power = power
 40 |     self.decay_schedule_fn = decay_schedule_fn
 41 |     self.name = name
 42 | 
 43 |   def __call__(self, step):
 44 |     with tf.name_scope(self.name or 'WarmUp') as name:
 45 |       # Implements polynomial warmup. i.e., if global_step < warmup_steps, the
 46 |       # learning rate will be `global_step/num_warmup_steps * init_lr`.
 47 |       global_step_float = tf.cast(step, tf.float32)
 48 |       warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
 49 |       warmup_percent_done = global_step_float / warmup_steps_float
 50 |       warmup_learning_rate = (
 51 |           self.initial_learning_rate *
 52 |           tf.math.pow(warmup_percent_done, self.power))
 53 |       return tf.cond(global_step_float < warmup_steps_float,
 54 |                      lambda: warmup_learning_rate,
 55 |                      lambda: self.decay_schedule_fn(step),
 56 |                      name=name)
 57 | 
 58 |   def get_config(self):
 59 |     return {
 60 |         'initial_learning_rate': self.initial_learning_rate,
 61 |         'decay_schedule_fn': self.decay_schedule_fn,
 62 |         'warmup_steps': self.warmup_steps,
 63 |         'power': self.power,
 64 |         'name': self.name
 65 |     }
 66 | 
 67 | 
 68 | def create_optimizer(init_lr, num_train_steps, num_warmup_steps):
 69 |   """Creates an optimizer with learning rate schedule."""
 70 |   # Implements linear decay of the learning rate.
 71 |   learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
 72 |       initial_learning_rate=init_lr,
 73 |       decay_steps=num_train_steps,
 74 |       end_learning_rate=0.0)
 75 |   if num_warmup_steps:
 76 |     learning_rate_fn = WarmUp(initial_learning_rate=init_lr,
 77 |                               decay_schedule_fn=learning_rate_fn,
 78 |                               warmup_steps=num_warmup_steps)
 79 |   optimizer = AdamWeightDecay(
 80 |       learning_rate=learning_rate_fn,
 81 |       weight_decay_rate=0.01,
 82 |       beta_1=0.9,
 83 |       beta_2=0.999,
 84 |       epsilon=1e-6,
 85 |       exclude_from_weight_decay=['layer_norm', 'bias'])
 86 |   return optimizer
 87 | 
 88 | 
 89 | class AdamWeightDecay(tf.keras.optimizers.Adam):
 90 |   """Adam enables L2 weight decay and clip_by_global_norm on gradients.
 91 | 
 92 |   Just adding the square of the weights to the loss function is *not* the
 93 |   correct way of using L2 regularization/weight decay with Adam, since that will
 94 |   interact with the m and v parameters in strange ways.
 95 | 
 96 |   Instead we want ot decay the weights in a manner that doesn't interact with
 97 |   the m/v parameters. This is equivalent to adding the square of the weights to
 98 |   the loss with plain (non-momentum) SGD.
 99 |   """
100 | 
101 |   def __init__(self,
102 |                learning_rate=0.001,
103 |                beta_1=0.9,
104 |                beta_2=0.999,
105 |                epsilon=1e-7,
106 |                amsgrad=False,
107 |                weight_decay_rate=0.0,
108 |                include_in_weight_decay=None,
109 |                exclude_from_weight_decay=None,
110 |                name='AdamWeightDecay',
111 |                **kwargs):
112 |     super(AdamWeightDecay, self).__init__(
113 |         learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
114 |     self.weight_decay_rate = weight_decay_rate
115 |     self._include_in_weight_decay = include_in_weight_decay
116 |     self._exclude_from_weight_decay = exclude_from_weight_decay
117 | 
118 |   @classmethod
119 |   def from_config(cls, config):
120 |     """Creates an optimizer from its config with WarmUp custom object."""
121 |     custom_objects = {'WarmUp': WarmUp}
122 |     return super(AdamWeightDecay, cls).from_config(
123 |         config, custom_objects=custom_objects)
124 | 
125 |   def _prepare_local(self, var_device, var_dtype, apply_state):
126 |     super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype,
127 |                                                 apply_state)
128 |     apply_state['weight_decay_rate'] = tf.constant(
129 |         self.weight_decay_rate, name='adam_weight_decay_rate')
130 | 
131 |   def _decay_weights_op(self, var, learning_rate, apply_state):
132 |     do_decay = self._do_use_weight_decay(var.name)
133 |     if do_decay:
134 |       return var.assign_sub(
135 |           learning_rate * var *
136 |           apply_state['weight_decay_rate'],
137 |           use_locking=self._use_locking)
138 |     return tf.no_op()
139 | 
140 |   def apply_gradients(self, grads_and_vars, clip_norm, name=None):
141 |     grads, tvars = list(zip(*grads_and_vars))
142 |     (grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm)
143 |     return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars))
144 | 
145 |   def _get_lr(self, var_device, var_dtype, apply_state):
146 |     """Retrieves the learning rate with the given state."""
147 |     if apply_state is None:
148 |       return self._decayed_lr_t[var_dtype], {}
149 | 
150 |     apply_state = apply_state or {}
151 |     coefficients = apply_state.get((var_device, var_dtype))
152 |     if coefficients is None:
153 |       coefficients = self._fallback_apply_state(var_device, var_dtype)
154 |       apply_state[(var_device, var_dtype)] = coefficients
155 | 
156 |     return coefficients['lr_t'], dict(apply_state=apply_state)
157 | 
158 |   def _resource_apply_dense(self, grad, var, apply_state=None):
159 |     lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
160 |     decay = self._decay_weights_op(var, lr_t, apply_state)
161 |     with tf.control_dependencies([decay]):
162 |       return super(AdamWeightDecay, self)._resource_apply_dense(
163 |           grad, var, **kwargs)
164 | 
165 |   def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
166 |     lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
167 |     decay = self._decay_weights_op(var, lr_t, apply_state)
168 |     with tf.control_dependencies([decay]):
169 |       return super(AdamWeightDecay, self)._resource_apply_sparse(
170 |           grad, var, indices, **kwargs)
171 | 
172 |   def get_config(self):
173 |     config = super(AdamWeightDecay, self).get_config()
174 |     config.update({
175 |         'weight_decay_rate': self.weight_decay_rate,
176 |     })
177 |     return config
178 | 
179 |   def _do_use_weight_decay(self, param_name):
180 |     """Whether to use L2 weight decay for `param_name`."""
181 |     if self.weight_decay_rate == 0:
182 |       return False
183 | 
184 |     if self._include_in_weight_decay:
185 |       for r in self._include_in_weight_decay:
186 |         if re.search(r, param_name) is not None:
187 |           return True
188 | 
189 |     if self._exclude_from_weight_decay:
190 |       for r in self._exclude_from_weight_decay:
191 |         if re.search(r, param_name) is not None:
192 |           return False
193 |     return True
194 | 
195 | 
196 | ## Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py
197 | class GradientAccumulator(object):
198 |     """Distribution strategies-aware gradient accumulation utility."""
199 | 
200 |     def __init__(self):
201 |         """Initializes the accumulator."""
202 |         self._gradients = []
203 |         self._accum_steps = tf.Variable(
204 |             initial_value=0,
205 |             dtype=tf.int64,
206 |             trainable=False,
207 |             aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)
208 | 
209 |     @property
210 |     def step(self):
211 |         """Number of accumulated steps."""
212 |         return self._accum_steps.value()
213 | 
214 |     @property
215 |     def gradients(self):
216 |         """The accumulated gradients."""
217 |         return list(gradient.value() if gradient is not None else gradient for gradient in self._get_replica_gradients())
218 | 
219 |     def __call__(self, gradients):
220 |         """Accumulates :obj:`gradients`."""
221 |         if not self._gradients:
222 |             self._gradients.extend([tf.Variable(tf.zeros_like(gradient), trainable=False) if gradient is not None else gradient for gradient in gradients])
223 | 
224 |         if len(gradients) != len(self._gradients):
225 |             raise ValueError("Expected %s gradients, but got %d" % (len(self._gradients), len(gradients)))
226 | 
227 |         for accum_gradient, gradient in zip(self._get_replica_gradients(), gradients):
228 |             if accum_gradient is not None:
229 |                 accum_gradient.assign_add(gradient)
230 | 
231 |         self._accum_steps.assign_add(1)
232 | 
233 |     def reset(self):
234 |         """Resets the accumulated gradients."""
235 |         if self._gradients:
236 |             self._accum_steps.assign(0)
237 | 
238 |         for gradient in self._get_replica_gradients():
239 |             if gradient is not None:
240 |                 gradient.assign(tf.zeros_like(gradient))
241 | 
242 |     def _get_replica_gradients(self):
243 |         if tf.distribute.has_strategy():
244 |             # In a replica context, we want to accumulate gradients on each replica
245 |             # without synchronization, so we directly assign the value of the
246 |             # current replica.
247 |             replica_context = tf.distribute.get_replica_context()
248 | 
249 |             if replica_context is None or tf.distribute.get_strategy().num_replicas_in_sync == 1:
250 |                 return self._gradients
251 | 
252 |             return (gradient.device_map.select_for_current_replica(gradient.values, replica_context) for gradient in self._gradients)
253 |         else:
254 |             return self._gradients
255 | 


--------------------------------------------------------------------------------