├── third_party └── ud-conversion-tools │ ├── lib │ └── __init__.py │ └── conllu_to_conll.py ├── init.sh ├── src └── transformers │ ├── commands │ ├── __init__.py │ ├── download.py │ ├── env.py │ ├── run.py │ ├── train.py │ ├── convert.py │ └── serving.py │ ├── data │ ├── processors │ │ ├── __init__.py │ │ └── xnli.py │ ├── __init__.py │ └── metrics │ │ ├── __init__.py │ │ └── mlqa_evaluation_v1.py │ ├── tokenization_bart.py │ ├── activations.py │ ├── configuration_mmbt.py │ ├── configuration_camembert.py │ ├── utils_encoder_decoder.py │ ├── configuration_xlm_roberta.py │ ├── convert_t5_original_tf_checkpoint_to_pytorch.py │ ├── convert_bert_original_tf_checkpoint_to_pytorch.py │ ├── convert_albert_original_tf_checkpoint_to_pytorch.py │ ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py │ ├── convert_openai_original_tf_checkpoint_to_pytorch.py │ ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py │ ├── configuration_roberta.py │ ├── tokenization_distilbert.py │ ├── configuration_bart.py │ ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py │ ├── configuration_t5.py │ ├── convert_bert_pytorch_checkpoint_to_original_tf.py │ ├── convert_bart_original_pytorch_checkpoint_to_pytorch.py │ ├── modeling_tf_camembert.py │ ├── modeling_tf_xlm_roberta.py │ ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py │ ├── tokenization_flaubert.py │ ├── modeling_camembert.py │ ├── configuration_ctrl.py │ ├── hf_api.py │ ├── configuration_distilbert.py │ ├── tokenization_t5.py │ ├── configuration_albert.py │ ├── optimization.py │ └── modeling_tf_transfo_xl_utilities.py ├── scripts ├── download_model.sh ├── preprocess_panx.sh └── preprocess_udpos.sh ├── LICENSE ├── transformers-cli ├── dockers └── Dockerfile ├── README.md ├── setup.py ├── eval.sh └── train.sh /third_party/ud-conversion-tools/lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export DATA_ROOT=/ssd/data 4 | 5 | export N_GPU=`nvidia-smi -L | wc -l` 6 | export PYTORCH_PRETRAINED_BERT_CACHE=$DATA_ROOT/pretrained-cache 7 | export OMP_NUM_THREADS=4 8 | export MKL_NUM_THREADS=4 9 | 10 | pip install --user --editable ./ 11 | mkdir -p $DATA_ROOT/outputs 12 | -------------------------------------------------------------------------------- /src/transformers/commands/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from argparse import ArgumentParser 3 | 4 | 5 | class BaseTransformersCLICommand(ABC): 6 | @staticmethod 7 | @abstractmethod 8 | def register_subcommand(parser: ArgumentParser): 9 | raise NotImplementedError() 10 | 11 | @abstractmethod 12 | def run(self): 13 | raise NotImplementedError() 14 | -------------------------------------------------------------------------------- /scripts/download_model.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
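# NOTE (assumed usage; not documented in this repo): this script only defines download_model(),
# which relies on an externally set $task variable and is never invoked below. Callers appear to be
# expected to set task=<model_dir> (a subdirectory under outputs/phase2 on the blob) and call download_model themselves.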
3 | 4 | DOWNLOAD=$1 5 | 6 | BLOB='https://convaisharables.blob.core.windows.net/filter/data/outputs/phase2' 7 | 8 | download_model() { 9 | mkdir -p $DOWNLOAD/outputs/phase2/$task 10 | for file in config.json pytorch_model.bin sentencepiece.bpe.model special_tokens_map.json tokenizer_config.json; do 11 | wget $BLOB/$task/$file -O $DOWNLOAD/outputs/phase2/$task/$file 12 | done 13 | } 14 | -------------------------------------------------------------------------------- /src/transformers/data/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | from .xglue import xglue_convert_examples_to_features, xglue_output_modes, xglue_processors, xglue_tasks_num_labels 6 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels 7 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features 8 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor 9 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels 10 | -------------------------------------------------------------------------------- /src/transformers/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | from .metrics import is_sklearn_available 6 | from .processors import ( 7 | DataProcessor, 8 | InputExample, 9 | InputFeatures, 10 | SingleSentenceClassificationProcessor, 11 | SquadExample, 12 | SquadFeatures, 13 | SquadV1Processor, 14 | SquadV2Processor, 15 | glue_convert_examples_to_features, 16 | glue_output_modes, 17 | glue_processors, 18 | glue_tasks_num_labels, 19 | 20 | xglue_convert_examples_to_features, 21 | xglue_output_modes, 22 | xglue_processors, 23 | xglue_tasks_num_labels, 24 | 25 | squad_convert_examples_to_features, 26 | xnli_output_modes, 27 | xnli_processors, 28 | xnli_tasks_num_labels, 29 | ) 30 | 31 | 32 | if is_sklearn_available(): 33 | from .metrics import glue_compute_metrics, xnli_compute_metrics, xglue_compute_metrics 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 yuwfan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /transformers-cli: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from argparse import ArgumentParser 3 | 4 | from transformers.commands.convert import ConvertCommand 5 | from transformers.commands.download import DownloadCommand 6 | from transformers.commands.env import EnvironmentCommand 7 | from transformers.commands.run import RunCommand 8 | from transformers.commands.serving import ServeCommand 9 | from transformers.commands.user import UserCommands 10 | 11 | if __name__ == '__main__': 12 | parser = ArgumentParser('Transformers CLI tool', usage='transformers-cli []') 13 | commands_parser = parser.add_subparsers(help='transformers-cli command helpers') 14 | 15 | # Register commands 16 | ConvertCommand.register_subcommand(commands_parser) 17 | DownloadCommand.register_subcommand(commands_parser) 18 | EnvironmentCommand.register_subcommand(commands_parser) 19 | RunCommand.register_subcommand(commands_parser) 20 | ServeCommand.register_subcommand(commands_parser) 21 | UserCommands.register_subcommand(commands_parser) 22 | 23 | # Let's go 24 | args = parser.parse_args() 25 | 26 | if not hasattr(args, 'func'): 27 | parser.print_help() 28 | exit(1) 29 | 30 | # Run 31 | service = args.func(args) 32 | service.run() 33 | -------------------------------------------------------------------------------- /src/transformers/commands/download.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from transformers.commands import BaseTransformersCLICommand 4 | 5 | 6 | def download_command_factory(args): 7 | return DownloadCommand(args.model, args.cache_dir, args.force) 8 | 9 | 10 | class DownloadCommand(BaseTransformersCLICommand): 11 | @staticmethod 12 | def register_subcommand(parser: ArgumentParser): 13 | download_parser = parser.add_parser("download") 14 | download_parser.add_argument( 15 | "--cache-dir", type=str, default=None, help="Path to location to store the models" 16 | ) 17 | download_parser.add_argument( 18 | "--force", action="store_true", help="Force the model to be download even if already in cache-dir" 19 | ) 20 | download_parser.add_argument("model", type=str, help="Name of the model to download") 21 | download_parser.set_defaults(func=download_command_factory) 22 | 23 | def __init__(self, model: str, cache: str, force: bool): 24 | self._model = model 25 | self._cache = cache 26 | self._force = force 27 | 28 | def run(self): 29 | from transformers import AutoModel, AutoTokenizer 30 | 31 | AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 32 | AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 33 | -------------------------------------------------------------------------------- /src/transformers/tokenization_bart.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from .tokenization_roberta import RobertaTokenizer 17 | 18 | 19 | # vocab and merges same as roberta 20 | vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json" 21 | merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt" 22 | _all_bart_models = ["bart-large", "bart-large-mnli", "bart-large-cnn"] 23 | 24 | 25 | class BartTokenizer(RobertaTokenizer): 26 | # merges and vocab same as Roberta 27 | max_model_input_sizes = {m: 1024 for m in _all_bart_models} 28 | pretrained_vocab_files_map = { 29 | "vocab_file": {m: vocab_url for m in _all_bart_models}, 30 | "merges_file": {m: merges_url for m in _all_bart_models}, 31 | } 32 | -------------------------------------------------------------------------------- /src/transformers/activations.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | 7 | def swish(x): 8 | return x * torch.sigmoid(x) 9 | 10 | 11 | def _gelu_python(x): 12 | """ Original Implementation of the gelu activation function in Google Bert repo when initially created. 13 | For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 14 | 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 15 | This is now written in C in torch.nn.functional 16 | Also see https://arxiv.org/abs/1606.08415 17 | """ 18 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 19 | 20 | 21 | if torch.__version__ < "1.4.0": 22 | gelu = _gelu_python 23 | else: 24 | gelu = F.gelu 25 | 26 | 27 | def gelu_new(x): 28 | """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). 29 | Also see https://arxiv.org/abs/1606.08415 30 | """ 31 | return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 32 | 33 | 34 | ACT2FN = { 35 | "relu": F.relu, 36 | "swish": swish, 37 | "gelu": gelu, 38 | "tanh": F.tanh, 39 | "gelu_new": gelu_new, 40 | } 41 | 42 | 43 | def get_activation(activation_string): 44 | if activation_string in ACT2FN: 45 | return ACT2FN[activation_string] 46 | else: 47 | raise KeyError( 48 | "function {} not found in ACT2FN mapping {} or torch.nn.functional".format( 49 | activation_string, list(ACT2FN.keys()) 50 | ) 51 | ) 52 | -------------------------------------------------------------------------------- /src/transformers/configuration_mmbt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # Copyright (c) HuggingFace Inc. team. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ MMBT configuration """ 17 | 18 | 19 | import logging 20 | 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | class MMBTConfig(object): 26 | """Configuration class to store the configuration of a `MMBT Model`. 27 | 28 | Args: 29 | config (:obj:`~transformers.PreTrainedConfig`): 30 | Config of the underlying Transformer models. Its values are 31 | copied over to use a single config. 32 | num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`): 33 | Size of final Linear layer for classification. 34 | modal_hidden_size (:obj:`int`, optional, defautls to 2048): 35 | Embedding dimension of the non-text modality encoder. 36 | """ 37 | 38 | def __init__(self, config, num_labels=None, modal_hidden_size=2048): 39 | self.__dict__ = config.__dict__ 40 | self.modal_hidden_size = modal_hidden_size 41 | if num_labels: 42 | self.num_labels = num_labels 43 | -------------------------------------------------------------------------------- /src/transformers/configuration_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ CamemBERT configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_roberta import RobertaConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json", 28 | "umberto-commoncrawl-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-commoncrawl-cased-v1/config.json", 29 | "umberto-wikipedia-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-wikipedia-uncased-v1/config.json", 30 | } 31 | 32 | 33 | class CamembertConfig(RobertaConfig): 34 | """ 35 | This class overrides :class:`~transformers.RobertaConfig`. Please check the 36 | superclass for the appropriate documentation alongside usage examples. 37 | """ 38 | 39 | pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 40 | model_type = "camembert" 41 | -------------------------------------------------------------------------------- /scripts/preprocess_panx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-bert-base-multilingual-cased} 18 | DATA_DIR=${2:-"$REPO/download/"} 19 | 20 | TASK='panx' 21 | MAXL=128 22 | LANGS="ar,he,vi,id,jv,ms,tl,eu,ml,ta,te,af,nl,en,de,el,bn,hi,mr,ur,fa,fr,it,pt,es,bg,ru,ja,ka,ko,th,sw,yo,my,zh,kk,tr,et,fi,hu" 23 | LC="" 24 | if [ $MODEL == "bert-base-multilingual-cased" ]; then 25 | MODEL_TYPE="bert" 26 | elif [ $MODEL == "xlm-mlm-100-1280" ] || [ $MODEL == "xlm-mlm-tlm-xnli15-1024" ]; then 27 | MODEL_TYPE="xlm" 28 | LC=" --do_lower_case" 29 | elif [ $MODEL == "xlm-roberta-large" ] || [ $MODEL == "xlm-roberta-base" ]; then 30 | MODEL_TYPE="xlmr" 31 | fi 32 | SAVE_DIR="$DATA_DIR/$TASK/${TASK}_processed_maxlen${MAXL}" 33 | mkdir -p $SAVE_DIR 34 | python3 $REPO/utils_preprocess.py \ 35 | --data_dir $DATA_DIR/$TASK/ \ 36 | --task panx_tokenize \ 37 | --model_name_or_path $MODEL \ 38 | --model_type $MODEL_TYPE \ 39 | --max_len $MAXL \ 40 | --output_dir $SAVE_DIR \ 41 | --languages $LANGS $LC >> $SAVE_DIR/preprocess.log 42 | if [ ! -f $SAVE_DIR/labels.txt ]; then 43 | cat $SAVE_DIR/*/*.${MODEL} | cut -f 2 | grep -v "^$" | sort | uniq > $SAVE_DIR/labels.txt 44 | fi 45 | -------------------------------------------------------------------------------- /scripts/preprocess_udpos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | REPO=$PWD 17 | MODEL=${1:-bert-base-multilingual-cased} 18 | DATA_DIR=${2:-"$REPO/download/"} 19 | 20 | TASK='udpos' 21 | MAXL=128 22 | LANGS='af,ar,bg,de,el,en,es,et,eu,fa,fi,fr,he,hi,hu,id,it,ja,kk,ko,mr,nl,pt,ru,ta,te,th,tl,tr,ur,vi,yo,zh' 23 | LC="" 24 | if [ $MODEL == "bert-base-multilingual-cased" ]; then 25 | MODEL_TYPE="bert" 26 | elif [ $MODEL == "xlm-mlm-100-1280" ] || [ $MODEL == "xlm-mlm-tlm-xnli15-1024" ]; then 27 | MODEL_TYPE="xlm" 28 | LC=" --do_lower_case" 29 | elif [ $MODEL == "xlm-roberta-large" ] || [ $MODEL == "xlm-roberta-base" ]; then 30 | MODEL_TYPE="xlmr" 31 | fi 32 | 33 | SAVE_DIR="$DATA_DIR/${TASK}/udpos_processed_maxlen${MAXL}" 34 | mkdir -p $SAVE_DIR 35 | python3 $REPO/utils_preprocess.py \ 36 | --data_dir $DATA_DIR/${TASK}/ \ 37 | --task udpos_tokenize \ 38 | --model_name_or_path $MODEL \ 39 | --model_type $MODEL_TYPE \ 40 | --max_len $MAXL \ 41 | --output_dir $SAVE_DIR \ 42 | --languages $LANGS $LC >> $SAVE_DIR/process.log 43 | if [ ! -f $SAVE_DIR/labels.txt ]; then 44 | echo "create label" 45 | cat $SAVE_DIR/*/*.${MODEL} | cut -f 2 | grep -v "^$" | sort | uniq > $SAVE_DIR/labels.txt 46 | fi 47 | -------------------------------------------------------------------------------- /src/transformers/utils_encoder_decoder.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Classes to support Encoder-Decoder architectures """ 16 | 17 | 18 | def prepare_encoder_decoder_model_kwargs(**kwargs): 19 | """ Prepare the encoder and decoder's keyword arguments. 20 | 21 | Keyword arguments come in 3 flavors: 22 | - encoder-specific (prefixed by `encoder_`) 23 | - decoder-specific (prefixed by `decoder_`) 24 | - those that apply to the model as whole. 25 | 26 | We let the specific kwargs override the common ones in case of 27 | conflict. 
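    For illustration, calling the model with ``input_ids=ids, attention_mask=mask, decoder_input_ids=dec_ids`` produces encoder kwargs ``{"input_ids": ids, "attention_mask": mask}`` and decoder kwargs ``{"input_ids": dec_ids, "attention_mask": mask, "encoder_attention_mask": mask}``.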
28 | """ 29 | 30 | kwargs_common = { 31 | argument: value 32 | for argument, value in kwargs.items() 33 | if not argument.startswith("encoder_") and not argument.startswith("decoder_") 34 | } 35 | if "input_ids" in kwargs_common: 36 | kwargs["encoder_input_ids"] = kwargs_common.pop("input_ids") 37 | 38 | decoder_kwargs = kwargs_common.copy() 39 | encoder_kwargs = kwargs_common.copy() 40 | encoder_kwargs.update( 41 | {argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")} 42 | ) 43 | decoder_kwargs.update( 44 | {argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")} 45 | ) 46 | decoder_kwargs["encoder_attention_mask"] = encoder_kwargs.get("attention_mask", None) 47 | return encoder_kwargs, decoder_kwargs 48 | -------------------------------------------------------------------------------- /dockers/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel 2 | 3 | RUN apt update && \ 4 | apt install -y bash \ 5 | wget \ 6 | ssh \ 7 | vim \ 8 | build-essential \ 9 | git \ 10 | curl \ 11 | zip \ 12 | unzip \ 13 | ca-certificates \ 14 | libopenblas-dev \ 15 | libomp-dev \ 16 | autoconf \ 17 | automake \ 18 | libtool \ 19 | locales \ 20 | python3 \ 21 | python3-pip && \ 22 | rm -rf /var/lib/apt/lists 23 | 24 | # uninstall Apex if present, twice to make absolutely sure :) 25 | RUN pip uninstall -y apex || : 26 | RUN pip uninstall -y apex || : 27 | RUN PWD_DIR=$(pwd) 28 | RUN cd $(mktemp -d) 29 | RUN git clone -q https://github.com/NVIDIA/apex.git 30 | RUN cd apex; git reset --hard de6378f5dae8fcf2879a4be8ecea8bbcb9e59d5; pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ 31 | #RUN cd apex; pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" \ 32 | # --global-option="--deprecated_fused_adam" --global-option="--xentropy" \ 33 | # --global-option="--fast_multihead_attn" ./ 34 | 35 | RUN cd $(mktemp -d) 36 | RUN git clone https://github.com/neubig/kytea.git 37 | RUN cd kytea && autoreconf -i && ./configure && make && make install 38 | RUN pip install kytea 39 | 40 | RUN pip install tensorboardX six numpy tqdm path.py pandas scikit-learn lmdb pyarrow py-lz4framed methodtools py-rouge pyrouge nltk seqeval sacremoses pythainlp jieba faiss urllib3==1.25.4 networkx==1.11 41 | 42 | #RUN cd $(mktemp -d) 43 | #RUN git clone https://github.com/pytorch/fairseq 44 | #RUN cd fairseq; pip install --editable ./ 45 | #RUN chmod -R 777 /opt/conda 46 | 47 | RUN cd $PWD_DIR 48 | RUN rm -rf /var/lib/apt/lists/* && locale-gen "en_US.UTF-8" 49 | ENV LANG en_US.UTF-8 50 | ENV LANGUAGE en_US:en 51 | ENV LC_ALL en_US.UTF-8 52 | 53 | CMD ["/bin/bash"] 54 | -------------------------------------------------------------------------------- /src/transformers/configuration_xlm_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ XLM-RoBERTa configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_roberta import RobertaConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json", 28 | "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json", 29 | "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json", 30 | "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json", 31 | "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json", 32 | "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json", 33 | } 34 | 35 | 36 | class XLMRobertaConfig(RobertaConfig): 37 | """ 38 | This class overrides :class:`~transformers.RobertaConfig`. Please check the 39 | superclass for the appropriate documentation alongside usage examples. 40 | """ 41 | 42 | pretrained_config_archive_map = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 43 | model_type = "xlm-roberta" 44 | -------------------------------------------------------------------------------- /src/transformers/commands/env.py: -------------------------------------------------------------------------------- 1 | import platform 2 | from argparse import ArgumentParser 3 | 4 | from transformers import __version__ as version 5 | from transformers import is_tf_available, is_torch_available 6 | from transformers.commands import BaseTransformersCLICommand 7 | 8 | 9 | def info_command_factory(_): 10 | return EnvironmentCommand() 11 | 12 | 13 | class EnvironmentCommand(BaseTransformersCLICommand): 14 | @staticmethod 15 | def register_subcommand(parser: ArgumentParser): 16 | download_parser = parser.add_parser("env") 17 | download_parser.set_defaults(func=info_command_factory) 18 | 19 | def run(self): 20 | pt_version = "not installed" 21 | pt_cuda_available = "NA" 22 | if is_torch_available(): 23 | import torch 24 | 25 | pt_version = torch.__version__ 26 | pt_cuda_available = torch.cuda.is_available() 27 | 28 | tf_version = "not installed" 29 | tf_cuda_available = "NA" 30 | if is_tf_available(): 31 | import tensorflow as tf 32 | 33 | tf_version = tf.__version__ 34 | try: 35 | # deprecated in v2.1 36 | tf_cuda_available = tf.test.is_gpu_available() 37 | except AttributeError: 38 | # returns list of devices, convert to bool 39 | tf_cuda_available = bool(tf.config.list_physical_devices("GPU")) 40 | 41 | info = { 42 | "`transformers` version": version, 43 | "Platform": platform.platform(), 44 | "Python version": platform.python_version(), 45 | "PyTorch version (GPU?)": "{} ({})".format(pt_version, pt_cuda_available), 46 | "Tensorflow version 
(GPU?)": "{} ({})".format(tf_version, tf_cuda_available), 47 | "Using GPU in script?": "", 48 | "Using distributed or parallel set-up in script?": "", 49 | } 50 | 51 | print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n") 52 | print(self.format_dict(info)) 53 | 54 | return info 55 | 56 | @staticmethod 57 | def format_dict(d): 58 | return "\n".join(["- {}: {}".format(prop, val) for prop, val in d.items()]) + "\n" 59 | -------------------------------------------------------------------------------- /src/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The T5 authors and HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert T5 checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import T5Config, T5Model, load_tf_weights_in_t5 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = T5Config.from_json_file(config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = T5Model(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_t5(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained T5 model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = BertConfig.from_json_file(bert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = BertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_bert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--bert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained BERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
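# Illustrative invocation (the checkpoint and config paths below are placeholders):
#   python convert_albert_original_tf_checkpoint_to_pytorch.py --tf_checkpoint_path /path/to/model.ckpt-best --albert_config_file /path/to/albert_config.json --pytorch_dump_path /path/to/pytorch_model.bin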
15 | """Convert ALBERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = AlbertConfig.from_json_file(albert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = AlbertForMaskedLM(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_albert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--albert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained ALBERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /src/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import CONFIG_NAME, WEIGHTS_NAME, GPT2Config, GPT2Model, load_tf_weights_in_gpt2 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 30 | # Construct model 31 | if gpt2_config_file == "": 32 | config = GPT2Config() 33 | else: 34 | config = GPT2Config.from_json_file(gpt2_config_file) 35 | model = GPT2Model(config) 36 | 37 | # Load weights from numpy 38 | load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) 39 | 40 | # Save pytorch-model 41 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 42 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 43 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 44 | torch.save(model.state_dict(), pytorch_weights_dump_path) 45 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 46 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 47 | f.write(config.to_json_string()) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | # Required parameters 53 | parser.add_argument( 54 | "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 55 | ) 56 | parser.add_argument( 57 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 58 | ) 59 | parser.add_argument( 60 | "--gpt2_config_file", 61 | default="", 62 | type=str, 63 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n" 64 | "This specifies the model architecture.", 65 | ) 66 | args = parser.parse_args() 67 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path) 68 | -------------------------------------------------------------------------------- /src/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import CONFIG_NAME, WEIGHTS_NAME, OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 30 | # Construct model 31 | if openai_config_file == "": 32 | config = OpenAIGPTConfig() 33 | else: 34 | config = OpenAIGPTConfig.from_json_file(openai_config_file) 35 | model = OpenAIGPTModel(config) 36 | 37 | # Load weights from numpy 38 | load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path) 39 | 40 | # Save pytorch-model 41 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 42 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 43 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 44 | torch.save(model.state_dict(), pytorch_weights_dump_path) 45 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 46 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 47 | f.write(config.to_json_string()) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | # Required parameters 53 | parser.add_argument( 54 | "--openai_checkpoint_folder_path", 55 | default=None, 56 | type=str, 57 | required=True, 58 | help="Path to the TensorFlow checkpoint path.", 59 | ) 60 | parser.add_argument( 61 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 62 | ) 63 | parser.add_argument( 64 | "--openai_config_file", 65 | default="", 66 | type=str, 67 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n" 68 | "This specifies the model architecture.", 69 | ) 70 | args = parser.parse_args() 71 | convert_openai_checkpoint_to_pytorch( 72 | args.openai_checkpoint_folder_path, args.openai_config_file, args.pytorch_dump_folder_path 73 | ) 74 | -------------------------------------------------------------------------------- /third_party/ud-conversion-tools/conllu_to_conll.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from itertools import islice 3 | from pathlib import Path 4 | import argparse 5 | import sys, copy 6 | 7 | from lib.conll import CoNLLReader 8 | 9 | def main(): 10 | parser = argparse.ArgumentParser(description="""Convert conllu to conll format""") 11 | parser.add_argument('input', help="conllu file") 12 | parser.add_argument('output', help="target file", type=Path) 13 | parser.add_argument('--replace_subtokens_with_fused_forms', help="By default removes fused tokens", default=False, action="store_true") 14 | parser.add_argument('--remove_deprel_suffixes', help="Restrict deprels to the common universal subset, e.g. 
nmod:tmod becomes nmod", default=False, action="store_true") 15 | parser.add_argument('--remove_node_properties', help="space-separated list of node properties to remove: form, lemma, cpostag, postag, feats", choices=['form', 'lemma', 'cpostag','postag','feats'], metavar='prop', type=str, nargs='+') 16 | parser.add_argument('--lang', help="specify a language 2-letter code", default="default") 17 | parser.add_argument('--output_format', choices=['conll2006', 'conll2009', 'conllu'], default="conll2006") 18 | parser.add_argument('--remove_arabic_diacritics', help="remove Arabic short vowels", default=False, action="store_true") 19 | parser.add_argument('--print_comments',default=False,action="store_true") 20 | parser.add_argument('--print_fused_forms',default=False,action="store_true") 21 | 22 | args = parser.parse_args() 23 | 24 | if sys.version_info < (3,0): 25 | print("Sorry, requires Python 3.x.") #suggestion: install anaconda python 26 | sys.exit(1) 27 | 28 | POSRANKPRECEDENCEDICT = defaultdict(list) 29 | POSRANKPRECEDENCEDICT["default"] = "VERB NOUN PROPN PRON ADJ NUM ADV INTJ AUX ADP DET PART CCONJ SCONJ X PUNCT ".split(" ") 30 | # POSRANKPRECEDENCEDICT["de"] = "PROPN ADP DET ".split(" ") 31 | POSRANKPRECEDENCEDICT["es"] = "VERB AUX PRON ADP DET".split(" ") 32 | POSRANKPRECEDENCEDICT["fr"] = "VERB AUX PRON NOUN ADJ ADV ADP DET PART SCONJ CONJ".split(" ") 33 | POSRANKPRECEDENCEDICT["it"] = "VERB AUX ADV PRON ADP DET INTJ".split(" ") 34 | 35 | if args.lang in POSRANKPRECEDENCEDICT: 36 | current_pos_precedence_list = POSRANKPRECEDENCEDICT[args.lang] 37 | else: 38 | current_pos_precedence_list = POSRANKPRECEDENCEDICT["default"] 39 | 40 | cio = CoNLLReader() 41 | orig_treebank = cio.read_conll_u(args.input)#, args.keep_fused_forms, args.lang, POSRANKPRECEDENCEDICT) 42 | modif_treebank = copy.copy(orig_treebank) 43 | 44 | # As per Dec 2015 the args.lang variable is redundant once you have current_pos_precedence_list 45 | # We keep it for future modifications, i.e. any language-specific modules 46 | for s in modif_treebank: 47 | # print('sentence', s.get_sentence_as_string(printid=True)) 48 | s.filter_sentence_content(args.replace_subtokens_with_fused_forms, args.lang, current_pos_precedence_list,args.remove_node_properties,args.remove_deprel_suffixes,args.remove_arabic_diacritics) 49 | 50 | cio.write_conll(modif_treebank,args.output, args.output_format,print_fused_forms=args.print_fused_forms, print_comments=args.print_comments) 51 | 52 | if __name__ == "__main__": 53 | main() -------------------------------------------------------------------------------- /src/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import json 20 | import logging 21 | 22 | import numpy 23 | import torch 24 | 25 | from transformers import CONFIG_NAME, WEIGHTS_NAME 26 | from transformers.tokenization_xlm import VOCAB_FILES_NAMES 27 | 28 | 29 | logging.basicConfig(level=logging.INFO) 30 | 31 | 32 | def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path): 33 | # Load checkpoint 34 | chkpt = torch.load(xlm_checkpoint_path, map_location="cpu") 35 | 36 | state_dict = chkpt["model"] 37 | 38 | # We have the base model one level deeper than the original XLM repository 39 | two_levels_state_dict = {} 40 | for k, v in state_dict.items(): 41 | if "pred_layer" in k: 42 | two_levels_state_dict[k] = v 43 | else: 44 | two_levels_state_dict["transformer." + k] = v 45 | 46 | config = chkpt["params"] 47 | config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))) 48 | 49 | vocab = chkpt["dico_word2id"] 50 | vocab = dict((s + "" if s.find("@@") == -1 and i > 13 else s.replace("@@", ""), i) for s, i in vocab.items()) 51 | 52 | # Save pytorch-model 53 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 54 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 55 | pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["vocab_file"] 56 | 57 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 58 | torch.save(two_levels_state_dict, pytorch_weights_dump_path) 59 | 60 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 61 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 62 | f.write(json.dumps(config, indent=2) + "\n") 63 | 64 | print("Save vocab file to {}".format(pytorch_config_dump_path)) 65 | with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f: 66 | f.write(json.dumps(vocab, indent=2) + "\n") 67 | 68 | 69 | if __name__ == "__main__": 70 | parser = argparse.ArgumentParser() 71 | # Required parameters 72 | parser.add_argument( 73 | "--xlm_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." 74 | ) 75 | parser.add_argument( 76 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 77 | ) 78 | args = parser.parse_args() 79 | convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path) 80 | -------------------------------------------------------------------------------- /src/transformers/data/processors/xnli.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ XNLI utils (dataset loading and evaluation) """ 17 | 18 | 19 | import logging 20 | import os 21 | 22 | from .utils import DataProcessor, InputExample 23 | 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | class XnliProcessor(DataProcessor): 29 | """Processor for the XNLI dataset. 30 | Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207""" 31 | 32 | def __init__(self, language, train_language=None): 33 | self.language = language 34 | self.train_language = train_language 35 | 36 | def get_train_examples(self, data_dir): 37 | """See base class.""" 38 | lg = self.language if self.train_language is None else self.train_language 39 | lines = self._read_tsv(os.path.join(data_dir, "XNLI-MT-1.0/multinli/multinli.train.{}.tsv".format(lg))) 40 | examples = [] 41 | for (i, line) in enumerate(lines): 42 | if i == 0: 43 | continue 44 | guid = "%s-%s" % ("train", i) 45 | text_a = line[0] 46 | text_b = line[1] 47 | label = "contradiction" if line[2] == "contradictory" else line[2] 48 | assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str) 49 | examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 50 | return examples 51 | 52 | def get_test_examples(self, data_dir): 53 | """See base class.""" 54 | lines = self._read_tsv(os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv")) 55 | examples = [] 56 | for (i, line) in enumerate(lines): 57 | if i == 0: 58 | continue 59 | language = line[0] 60 | if language != self.language: 61 | continue 62 | guid = "%s-%s" % ("test", i) 63 | text_a = line[6] 64 | text_b = line[7] 65 | label = line[1] 66 | assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str) 67 | examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 68 | return examples 69 | 70 | def get_labels(self): 71 | """See base class.""" 72 | return ["contradiction", "entailment", "neutral"] 73 | 74 | 75 | xnli_processors = { 76 | "xnli": XnliProcessor, 77 | } 78 | 79 | xnli_output_modes = { 80 | "xnli": "classification", 81 | } 82 | 83 | xnli_tasks_num_labels = { 84 | "xnli": 3, 85 | } 86 | -------------------------------------------------------------------------------- /src/transformers/configuration_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ RoBERTa configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_bert import BertConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", 28 | "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", 29 | "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", 30 | "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json", 31 | "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-config.json", 32 | "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-config.json", 33 | } 34 | 35 | 36 | class RobertaConfig(BertConfig): 37 | r""" 38 | This is the configuration class to store the configuration of an :class:`~transformers.RobertaModel`. 39 | It is used to instantiate an RoBERTa model according to the specified arguments, defining the model 40 | architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of 41 | the BERT `bert-base-uncased `__ architecture. 42 | 43 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used 44 | to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` 45 | for more information. 46 | 47 | The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`. 48 | It reuses the same defaults. Please check the parent class for more information. 49 | 50 | Example:: 51 | 52 | from transformers import RobertaConfig, RobertaModel 53 | 54 | # Initializing a RoBERTa configuration 55 | configuration = RobertaConfig() 56 | 57 | # Initializing a model from the configuration 58 | model = RobertaModel(configuration) 59 | 60 | # Accessing the model configuration 61 | configuration = model.config 62 | 63 | Attributes: 64 | pretrained_config_archive_map (Dict[str, str]): 65 | A dictionary containing all the available pre-trained checkpoints. 66 | """ 67 | pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 68 | model_type = "roberta" 69 | -------------------------------------------------------------------------------- /src/transformers/tokenization_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for DistilBERT.""" 16 | 17 | 18 | import logging 19 | 20 | from .tokenization_bert import BertTokenizer, BertTokenizerFast 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 26 | 27 | PRETRAINED_VOCAB_FILES_MAP = { 28 | "vocab_file": { 29 | "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 30 | "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 31 | "distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", 32 | "distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", 33 | "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt", 34 | "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", 35 | } 36 | } 37 | 38 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 39 | "distilbert-base-uncased": 512, 40 | "distilbert-base-uncased-distilled-squad": 512, 41 | "distilbert-base-cased": 512, 42 | "distilbert-base-cased-distilled-squad": 512, 43 | "distilbert-base-german-cased": 512, 44 | "distilbert-base-multilingual-cased": 512, 45 | } 46 | 47 | 48 | PRETRAINED_INIT_CONFIGURATION = { 49 | "distilbert-base-uncased": {"do_lower_case": True}, 50 | "distilbert-base-uncased-distilled-squad": {"do_lower_case": True}, 51 | "distilbert-base-cased": {"do_lower_case": False}, 52 | "distilbert-base-cased-distilled-squad": {"do_lower_case": False}, 53 | "distilbert-base-german-cased": {"do_lower_case": False}, 54 | "distilbert-base-multilingual-cased": {"do_lower_case": False}, 55 | } 56 | 57 | 58 | class DistilBertTokenizer(BertTokenizer): 59 | r""" 60 | Constructs a DistilBertTokenizer. 61 | :class:`~transformers.DistilBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 62 | tokenization: punctuation splitting + wordpiece. 63 | 64 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 65 | parameters. 66 | """ 67 | 68 | vocab_files_names = VOCAB_FILES_NAMES 69 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 70 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 71 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 72 | 73 | 74 | class DistilBertTokenizerFast(BertTokenizerFast): 75 | vocab_files_names = VOCAB_FILES_NAMES 76 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 77 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 78 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 79 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FILTER: An Enhanced Fusion Method for Cross-lingual Language Understanding 2 | 3 | This is the official repository of [FILTER](https://arxiv.org/abs/2009.05166). 4 | 5 | ## Requirements 6 | We provide Docker image for easier reproduction. Please use `dockers/Dockerfile` or pull image directly. 7 | ```bash 8 | docker pull studyfang/multilingual:xtreme 9 | ``` 10 | 11 | To run docker without sudo permission, please refer this documentation [Manage Docker as a non-root user](https://docs.docker.com/install/linux/linux-postinstall/). 
12 | Then, you can start the Docker container, e.g.
13 | ```bash
14 | docker run --gpus all -it -v /path/to/FILTER:/ssd studyfang/multilingual:xtreme bash
15 | ```
16 | 
17 | ## Quick Start
18 | 
19 | **NOTE**: Please make sure you have set up the environment correctly.
20 | 
21 | 1. Download data and our models
22 | 
23 | Please set your `DATA_ROOT` in init.sh, and then run the following command to download the specified task and its pretrained FILTER models.
24 | ```bash
25 | bash scripts/download_data.sh ${task}
26 | ```
27 | 
28 | To download all tasks and their pretrained models, please run `bash scripts/download_data.sh`, which may take a while.
29 | 
30 | 
31 | 2. Evaluate our pretrained models, which are saved in `$DATA_ROOT/outputs/phase${idx}/${task}`:
32 | ```bash
33 | bash eval.sh -t ${task} -n phase${idx}/${task}
34 | ```
35 | 
36 | where
37 | - `idx` can be `1` (without self-teaching) or `2` (+ self-teaching).
38 | - `task` is the name of the task to evaluate (one of `[xnli, pawsx, mlqa, tydiqa, xquad, udpos, panx]`)
39 | 
40 | ## Model Training
41 | For QA model training, we use translated training data from the XTREME team. Please refer to their [repo](https://github.com/google-research/xtreme) or their [translation](https://console.cloud.google.com/storage/browser/xtreme_translations) directly.
42 | Once your data is ready, simply run the following command to train a FILTER model for supported XTREME tasks:
43 | ```bash
44 | bash train.sh -t ${task} -n ${task}
45 | ```
46 | To use a different number of local and fusion layers, you can run this command:
47 | ```bash
48 | bash train.sh -t ${task} -n ${task}_k${k}_m${m} -x "--filter_k ${k} --filter_m ${m}"
49 | ```
50 | 
51 | where
52 | - `task` is the name of the task to train (one of `[xnli, pawsx, mlqa, tydiqa, xquad, udpos, panx]`)
53 | - `k` is the number of fusion layers
54 | - `m` is the number of local layers
55 | 
56 | The output model will be saved to `${DATA_ROOT}/outputs/${task}_k${k}_m${m}`.
57 | 
58 | **Note that we ran experiments on 8 V100 GPUs for FILTER models. You may need to increase `gradient_accumulation_steps` if you have fewer GPUs.**
59 | 
60 | 
61 | ## Citation
62 | If you find this code useful, please star our repo or consider citing:
63 | ```
64 | @article{fang2020filter,
65 | title={FILTER: An enhanced fusion method for cross-lingual language understanding},
66 | author={Fang, Yuwei and Wang, Shuohang and Gan, Zhe and Sun, Siqi and Liu, Jingjing},
67 | journal={arXiv preprint arXiv:2009.05166},
68 | year={2020}
69 | }
70 | ```
71 | 
72 | ## Contributing
73 | 
74 | This project welcomes contributions and suggestions. Most contributions require you to
75 | agree to a Contributor License Agreement (CLA) declaring that you have the right to,
76 | and actually do, grant us the rights to use your contribution. For details, visit
77 | https://cla.microsoft.com.
78 | 
79 | When you submit a pull request, a CLA-bot will automatically determine whether you need
80 | to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the
81 | instructions provided by the bot. You will only need to do this once across all repositories using our CLA.
82 | 
83 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
84 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
85 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
86 | 87 | ## License 88 | 89 | MIT 90 | -------------------------------------------------------------------------------- /src/transformers/commands/run.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from argparse import ArgumentParser 3 | 4 | from transformers.commands import BaseTransformersCLICommand 5 | from transformers.pipelines import SUPPORTED_TASKS, Pipeline, PipelineDataFormat, pipeline 6 | 7 | 8 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 9 | 10 | 11 | def try_infer_format_from_ext(path: str): 12 | if not path: 13 | return "pipe" 14 | 15 | for ext in PipelineDataFormat.SUPPORTED_FORMATS: 16 | if path.endswith(ext): 17 | return ext 18 | 19 | raise Exception( 20 | "Unable to determine file format from file extension {}. " 21 | "Please provide the format through --format {}".format(path, PipelineDataFormat.SUPPORTED_FORMATS) 22 | ) 23 | 24 | 25 | def run_command_factory(args): 26 | nlp = pipeline( 27 | task=args.task, 28 | model=args.model if args.model else None, 29 | config=args.config, 30 | tokenizer=args.tokenizer, 31 | device=args.device, 32 | ) 33 | format = try_infer_format_from_ext(args.input) if args.format == "infer" else args.format 34 | reader = PipelineDataFormat.from_str( 35 | format=format, 36 | output_path=args.output, 37 | input_path=args.input, 38 | column=args.column if args.column else nlp.default_input_names, 39 | overwrite=args.overwrite, 40 | ) 41 | return RunCommand(nlp, reader) 42 | 43 | 44 | class RunCommand(BaseTransformersCLICommand): 45 | def __init__(self, nlp: Pipeline, reader: PipelineDataFormat): 46 | self._nlp = nlp 47 | self._reader = reader 48 | 49 | @staticmethod 50 | def register_subcommand(parser: ArgumentParser): 51 | run_parser = parser.add_parser("run", help="Run a pipeline through the CLI") 52 | run_parser.add_argument("--task", choices=SUPPORTED_TASKS.keys(), help="Task to run") 53 | run_parser.add_argument("--input", type=str, help="Path to the file to use for inference") 54 | run_parser.add_argument("--output", type=str, help="Path to the file that will be used post to write results.") 55 | run_parser.add_argument("--model", type=str, help="Name or path to the model to instantiate.") 56 | run_parser.add_argument("--config", type=str, help="Name or path to the model's config to instantiate.") 57 | run_parser.add_argument( 58 | "--tokenizer", type=str, help="Name of the tokenizer to use. (default: same as the model name)" 59 | ) 60 | run_parser.add_argument( 61 | "--column", 62 | type=str, 63 | help="Name of the column to use as input. 
(For multi columns input as QA use column1,columns2)", 64 | ) 65 | run_parser.add_argument( 66 | "--format", 67 | type=str, 68 | default="infer", 69 | choices=PipelineDataFormat.SUPPORTED_FORMATS, 70 | help="Input format to read from", 71 | ) 72 | run_parser.add_argument( 73 | "--device", 74 | type=int, 75 | default=-1, 76 | help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)", 77 | ) 78 | run_parser.add_argument("--overwrite", action="store_true", help="Allow overwriting the output file.") 79 | run_parser.set_defaults(func=run_command_factory) 80 | 81 | def run(self): 82 | nlp, outputs = self._nlp, [] 83 | 84 | for entry in self._reader: 85 | output = nlp(**entry) if self._reader.is_multi_columns else nlp(entry) 86 | if isinstance(output, dict): 87 | outputs.append(output) 88 | else: 89 | outputs += output 90 | 91 | # Saving data 92 | if self._nlp.binary_output: 93 | binary_path = self._reader.save_binary(outputs) 94 | logger.warning("Current pipeline requires output to be in binary format, saving at {}".format(binary_path)) 95 | else: 96 | self._reader.save(outputs) 97 | -------------------------------------------------------------------------------- /src/transformers/configuration_bart.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Fairseq Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ BART configuration """ 16 | 17 | 18 | import logging 19 | 20 | from .configuration_utils import PretrainedConfig 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | BART_PRETRAINED_CONFIG_ARCHIVE_MAP = { 26 | "bart-large": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large/config.json", 27 | "bart-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-mnli/config.json", 28 | "bart-large-cnn": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-cnn/config.json", 29 | } 30 | 31 | 32 | class BartConfig(PretrainedConfig): 33 | r""" 34 | Configuration class for Bart. Parameters are renamed from the fairseq implementation 35 | """ 36 | model_type = "bart" 37 | pretrained_config_archive_map = BART_PRETRAINED_CONFIG_ARCHIVE_MAP 38 | 39 | def __init__( 40 | self, 41 | activation_dropout=0.0, 42 | vocab_size=50265, 43 | pad_token_id=1, 44 | eos_token_id=2, 45 | d_model=1024, 46 | encoder_ffn_dim=4096, 47 | encoder_layers=12, 48 | encoder_attention_heads=16, 49 | decoder_ffn_dim=4096, 50 | decoder_layers=12, 51 | decoder_attention_heads=16, 52 | encoder_layerdrop=0.0, 53 | decoder_layerdrop=0.0, 54 | attention_dropout=0.0, 55 | dropout=0.1, 56 | max_position_embeddings=1024, 57 | init_std=0.02, 58 | classifier_dropout=0.0, 59 | output_past=False, 60 | num_labels=3, 61 | bos_token_id=0, 62 | **common_kwargs 63 | ): 64 | r""" 65 | :class:`~transformers.BartConfig` is the configuration class for `BartModel`. 
66 | Examples: 67 | config = BartConfig.from_pretrained('bart-large') 68 | model = BartModel(config) 69 | """ 70 | super().__init__( 71 | num_labels=num_labels, 72 | output_past=output_past, 73 | pad_token_id=pad_token_id, 74 | bos_token_id=bos_token_id, 75 | **common_kwargs, 76 | ) 77 | self.vocab_size = vocab_size 78 | self.d_model = d_model # encoder_embed_dim and decoder_embed_dim 79 | self.eos_token_id = eos_token_id 80 | self.encoder_ffn_dim = encoder_ffn_dim 81 | self.encoder_layers = self.num_hidden_layers = encoder_layers 82 | self.encoder_attention_heads = encoder_attention_heads 83 | self.encoder_layerdrop = encoder_layerdrop 84 | self.decoder_layerdrop = decoder_layerdrop 85 | self.decoder_ffn_dim = decoder_ffn_dim 86 | self.decoder_layers = decoder_layers 87 | self.decoder_attention_heads = decoder_attention_heads 88 | self.max_position_embeddings = max_position_embeddings 89 | self.init_std = init_std # Normal(0, this parameter) 90 | 91 | # 3 Types of Dropout 92 | self.attention_dropout = attention_dropout 93 | self.activation_dropout = activation_dropout 94 | self.dropout = dropout 95 | 96 | # Classifier stuff 97 | self.classif_dropout = classifier_dropout 98 | 99 | @property 100 | def num_attention_heads(self): 101 | return self.encoder_attention_heads 102 | 103 | @property 104 | def hidden_size(self): 105 | return self.d_model 106 | -------------------------------------------------------------------------------- /src/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
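The XLNet conversion script below is driven by argparse; a hypothetical invocation (all paths are placeholders) looks like:

```bash
python src/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py \
    --tf_checkpoint_path /path/to/xlnet/model.ckpt \
    --xlnet_config_file /path/to/xlnet/xlnet_config.json \
    --pytorch_dump_folder_path /path/to/output \
    --finetuning_task sts-b
```

`--finetuning_task` is optional; when it is omitted, a plain `XLNetLMHeadModel` is built instead of a classification or QA head.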
15 | """Convert BERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | import os 21 | 22 | import torch 23 | 24 | from transformers import ( 25 | CONFIG_NAME, 26 | WEIGHTS_NAME, 27 | XLNetConfig, 28 | XLNetForQuestionAnswering, 29 | XLNetForSequenceClassification, 30 | XLNetLMHeadModel, 31 | load_tf_weights_in_xlnet, 32 | ) 33 | 34 | 35 | GLUE_TASKS_NUM_LABELS = { 36 | "cola": 2, 37 | "mnli": 3, 38 | "mrpc": 2, 39 | "sst-2": 2, 40 | "sts-b": 1, 41 | "qqp": 2, 42 | "qnli": 2, 43 | "rte": 2, 44 | "wnli": 2, 45 | } 46 | 47 | 48 | logging.basicConfig(level=logging.INFO) 49 | 50 | 51 | def convert_xlnet_checkpoint_to_pytorch( 52 | tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None 53 | ): 54 | # Initialise PyTorch model 55 | config = XLNetConfig.from_json_file(bert_config_file) 56 | 57 | finetuning_task = finetuning_task.lower() if finetuning_task is not None else "" 58 | if finetuning_task in GLUE_TASKS_NUM_LABELS: 59 | print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config))) 60 | config.finetuning_task = finetuning_task 61 | config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task] 62 | model = XLNetForSequenceClassification(config) 63 | elif "squad" in finetuning_task: 64 | config.finetuning_task = finetuning_task 65 | model = XLNetForQuestionAnswering(config) 66 | else: 67 | model = XLNetLMHeadModel(config) 68 | 69 | # Load weights from tf checkpoint 70 | load_tf_weights_in_xlnet(model, config, tf_checkpoint_path) 71 | 72 | # Save pytorch-model 73 | pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) 74 | pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) 75 | print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) 76 | torch.save(model.state_dict(), pytorch_weights_dump_path) 77 | print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) 78 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 79 | f.write(config.to_json_string()) 80 | 81 | 82 | if __name__ == "__main__": 83 | parser = argparse.ArgumentParser() 84 | # Required parameters 85 | parser.add_argument( 86 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 87 | ) 88 | parser.add_argument( 89 | "--xlnet_config_file", 90 | default=None, 91 | type=str, 92 | required=True, 93 | help="The config json file corresponding to the pre-trained XLNet model. \n" 94 | "This specifies the model architecture.", 95 | ) 96 | parser.add_argument( 97 | "--pytorch_dump_folder_path", 98 | default=None, 99 | type=str, 100 | required=True, 101 | help="Path to the folder to store the PyTorch model or dataset/vocab.", 102 | ) 103 | parser.add_argument( 104 | "--finetuning_task", 105 | default=None, 106 | type=str, 107 | help="Name of a task on which the XLNet TensorFloaw model was fine-tuned", 108 | ) 109 | args = parser.parse_args() 110 | print(args) 111 | 112 | convert_xlnet_checkpoint_to_pytorch( 113 | args.tf_checkpoint_path, args.xlnet_config_file, args.pytorch_dump_folder_path, args.finetuning_task 114 | ) 115 | -------------------------------------------------------------------------------- /src/transformers/configuration_t5.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2010, The T5 Authors and HuggingFace Inc. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ T5 model configuration """ 16 | 17 | 18 | import logging 19 | 20 | from .configuration_utils import PretrainedConfig 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | T5_PRETRAINED_CONFIG_ARCHIVE_MAP = { 26 | "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json", 27 | "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json", 28 | "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json", 29 | "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json", 30 | "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json", 31 | } 32 | 33 | 34 | class T5Config(PretrainedConfig): 35 | r""" 36 | :class:`~transformers.T5Config` is the configuration class to store the configuration of a 37 | `T5Model`. 38 | 39 | 40 | Arguments: 41 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `T5Model`. 42 | hidden_size: Size of the encoder layers and the pooler layer. 43 | num_hidden_layers: Number of hidden layers in the Transformer encoder. 44 | num_attention_heads: Number of attention heads for each attention layer in 45 | the Transformer encoder. 46 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 47 | layer in the Transformer encoder. 48 | hidden_act: The non-linear activation function (function or string) in the 49 | encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. 50 | hidden_dropout_prob: The dropout probabilitiy for all fully connected 51 | layers in the embeddings, encoder, and pooler. 52 | attention_probs_dropout_prob: The dropout ratio for the attention 53 | probabilities. 54 | max_position_embeddings: The maximum sequence length that this model might 55 | ever be used with. Typically set this to something large just in case 56 | (e.g., 512 or 1024 or 2048). 57 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 58 | `T5Model`. 59 | initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing). 60 | layer_norm_eps: The epsilon used by LayerNorm. 
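Example (a minimal sketch, assuming the usual ``transformers`` imports; mirrors the examples in the other configuration classes)::

            from transformers import T5Config, T5Model

            # Initializing a T5 configuration with default values
            configuration = T5Config()

            # Initializing a model from the configuration
            model = T5Model(configuration)

            # Accessing the model configuration
            configuration = model.config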
61 | """ 62 | pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP 63 | model_type = "t5" 64 | 65 | def __init__( 66 | self, 67 | vocab_size=32128, 68 | n_positions=512, 69 | d_model=512, 70 | d_kv=64, 71 | d_ff=2048, 72 | num_layers=6, 73 | num_heads=8, 74 | relative_attention_num_buckets=32, 75 | dropout_rate=0.1, 76 | layer_norm_epsilon=1e-6, 77 | initializer_factor=1.0, 78 | **kwargs 79 | ): 80 | super().__init__(**kwargs) 81 | self.vocab_size = vocab_size 82 | self.n_positions = n_positions 83 | self.d_model = d_model 84 | self.d_kv = d_kv 85 | self.d_ff = d_ff 86 | self.num_layers = num_layers 87 | self.num_heads = num_heads 88 | self.relative_attention_num_buckets = relative_attention_num_buckets 89 | self.dropout_rate = dropout_rate 90 | self.layer_norm_epsilon = layer_norm_epsilon 91 | self.initializer_factor = initializer_factor 92 | 93 | @property 94 | def max_position_embeddings(self): 95 | return self.n_positions 96 | 97 | @property 98 | def hidden_size(self): 99 | return self.d_model 100 | 101 | @property 102 | def num_attention_heads(self): 103 | return self.num_heads 104 | 105 | @property 106 | def num_hidden_layers(self): 107 | return self.num_layers 108 | -------------------------------------------------------------------------------- /src/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" 17 | 18 | import argparse 19 | import os 20 | 21 | import numpy as np 22 | import tensorflow as tf 23 | import torch 24 | 25 | from transformers import BertModel 26 | 27 | 28 | def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str): 29 | 30 | """ 31 | :param model:BertModel Pytorch model instance to be converted 32 | :param ckpt_dir: Tensorflow model directory 33 | :param model_name: model name 34 | :return: 35 | 36 | Currently supported HF models: 37 | Y BertModel 38 | N BertForMaskedLM 39 | N BertForPreTraining 40 | N BertForMultipleChoice 41 | N BertForNextSentencePrediction 42 | N BertForSequenceClassification 43 | N BertForQuestionAnswering 44 | """ 45 | 46 | tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value") 47 | 48 | var_map = ( 49 | ("layer.", "layer_"), 50 | ("word_embeddings.weight", "word_embeddings"), 51 | ("position_embeddings.weight", "position_embeddings"), 52 | ("token_type_embeddings.weight", "token_type_embeddings"), 53 | (".", "/"), 54 | ("LayerNorm/weight", "LayerNorm/gamma"), 55 | ("LayerNorm/bias", "LayerNorm/beta"), 56 | ("weight", "kernel"), 57 | ) 58 | 59 | if not os.path.isdir(ckpt_dir): 60 | os.makedirs(ckpt_dir) 61 | 62 | state_dict = model.state_dict() 63 | 64 | def to_tf_var_name(name: str): 65 | for patt, repl in iter(var_map): 66 | name = name.replace(patt, repl) 67 | return "bert/{}".format(name) 68 | 69 | def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session): 70 | tf_dtype = tf.dtypes.as_dtype(tensor.dtype) 71 | tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) 72 | session.run(tf.variables_initializer([tf_var])) 73 | session.run(tf_var) 74 | return tf_var 75 | 76 | tf.reset_default_graph() 77 | with tf.Session() as session: 78 | for var_name in state_dict: 79 | tf_name = to_tf_var_name(var_name) 80 | torch_tensor = state_dict[var_name].numpy() 81 | if any([x in var_name for x in tensors_to_transpose]): 82 | torch_tensor = torch_tensor.T 83 | tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) 84 | tf.keras.backend.set_value(tf_var, torch_tensor) 85 | tf_weight = session.run(tf_var) 86 | print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor))) 87 | 88 | saver = tf.train.Saver(tf.trainable_variables()) 89 | saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) 90 | 91 | 92 | def main(raw_args=None): 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument("--model_name", type=str, required=True, help="model name e.g. 
bert-base-uncased") 95 | parser.add_argument( 96 | "--cache_dir", type=str, default=None, required=False, help="Directory containing pytorch model" 97 | ) 98 | parser.add_argument("--pytorch_model_path", type=str, required=True, help="/path/to/.bin") 99 | parser.add_argument("--tf_cache_dir", type=str, required=True, help="Directory in which to save tensorflow model") 100 | args = parser.parse_args(raw_args) 101 | 102 | model = BertModel.from_pretrained( 103 | pretrained_model_name_or_path=args.model_name, 104 | state_dict=torch.load(args.pytorch_model_path), 105 | cache_dir=args.cache_dir, 106 | ) 107 | 108 | convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_cache_dir, model_name=args.model_name) 109 | 110 | 111 | if __name__ == "__main__": 112 | main() 113 | -------------------------------------------------------------------------------- /src/transformers/convert_bart_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BART checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | from pathlib import Path 21 | 22 | import fairseq 23 | import torch 24 | from packaging import version 25 | 26 | from transformers import BartConfig, BartForMaskedLM, BartForSequenceClassification, BartModel, BartTokenizer 27 | 28 | 29 | FAIRSEQ_MODELS = ["bart.large", "bart.large.mnli", "bart.large.cnn"] 30 | 31 | if version.parse(fairseq.__version__) < version.parse("0.9.0"): 32 | raise Exception("requires fairseq >= 0.9.0") 33 | 34 | 35 | logging.basicConfig(level=logging.INFO) 36 | logger = logging.getLogger(__name__) 37 | 38 | SAMPLE_TEXT = " Hello world! cécé herlolip" 39 | 40 | rename_keys = [ 41 | ("model.classification_heads.mnli.dense.weight", "classification_head.dense.weight"), 42 | ("model.classification_heads.mnli.dense.bias", "classification_head.dense.bias"), 43 | ("model.classification_heads.mnli.out_proj.weight", "classification_head.out_proj.weight"), 44 | ("model.classification_heads.mnli.out_proj.bias", "classification_head.out_proj.bias"), 45 | ] 46 | IGNORE_KEYS = ["encoder.version", "decoder.version", "model.encoder.version", "model.decoder.version", "_float_tensor"] 47 | 48 | 49 | def rename_key(dct, old, new): 50 | val = dct.pop(old) 51 | dct[new] = val 52 | 53 | 54 | def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path): 55 | """ 56 | Copy/paste/tweak model's weights to our BERT structure. 
57 | """ 58 | bart = torch.hub.load("pytorch/fairseq", checkpoint_path) 59 | bart.eval() # disable dropout 60 | bart.model.upgrade_state_dict(bart.model.state_dict()) 61 | hf_model_name = checkpoint_path.replace(".", "-") 62 | config = BartConfig.from_pretrained(hf_model_name) 63 | tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0) 64 | tokens2 = BartTokenizer.from_pretrained(hf_model_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0) 65 | assert torch.eq(tokens, tokens2).all() 66 | 67 | if checkpoint_path in ["bart.large", "bart.large.cnn"]: 68 | state_dict = bart.model.state_dict() 69 | for k in IGNORE_KEYS: 70 | state_dict.pop(k, None) 71 | state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] 72 | model = BartModel(config) 73 | their_output = bart.extract_features(tokens) 74 | else: # MNLI Case 75 | state_dict = bart.state_dict() 76 | for k in IGNORE_KEYS: 77 | state_dict.pop(k, None) 78 | state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"] 79 | for src, dest in rename_keys: 80 | rename_key(state_dict, src, dest) 81 | model = BartForSequenceClassification(config) 82 | their_output = bart.predict("mnli", tokens, return_logits=True) 83 | 84 | # Load state dict 85 | model.load_state_dict(state_dict) 86 | model.eval() 87 | # Check results 88 | 89 | if checkpoint_path == "bart.large.cnn": # generate doesnt work yet 90 | model = BartForMaskedLM(config, base_model=model) 91 | assert "lm_head.weight" in model.state_dict() 92 | assert model.lm_head.out_features == config.max_position_embeddings 93 | model.eval() 94 | our_outputs = model.model.forward(tokens)[0] 95 | else: 96 | our_outputs = model.forward(tokens)[0] 97 | assert their_output.shape == our_outputs.shape 98 | assert (their_output == our_outputs).all().item() 99 | Path(pytorch_dump_folder_path).mkdir(exist_ok=True) 100 | model.save_pretrained(pytorch_dump_folder_path) 101 | 102 | 103 | if __name__ == "__main__": 104 | parser = argparse.ArgumentParser() 105 | # Required parameters 106 | parser.add_argument("fairseq_path", choices=FAIRSEQ_MODELS, type=str, help="") 107 | 108 | parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") 109 | args = parser.parse_args() 110 | convert_bart_checkpoint( 111 | args.fairseq_path, args.pytorch_dump_folder_path, 112 | ) 113 | -------------------------------------------------------------------------------- /src/transformers/modeling_tf_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ TF 2.0 RoBERTa model. 
""" 17 | 18 | 19 | import logging 20 | 21 | from .configuration_camembert import CamembertConfig 22 | from .file_utils import add_start_docstrings 23 | from .modeling_tf_roberta import ( 24 | TFRobertaForMaskedLM, 25 | TFRobertaForSequenceClassification, 26 | TFRobertaForTokenClassification, 27 | TFRobertaModel, 28 | ) 29 | 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {} 34 | 35 | 36 | CAMEMBERT_START_DOCSTRING = r""" 37 | 38 | .. note:: 39 | 40 | TF 2.0 models accepts two formats as inputs: 41 | 42 | - having all inputs as keyword arguments (like PyTorch models), or 43 | - having all inputs as a list, tuple or dict in the first positional arguments. 44 | 45 | This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having 46 | all the tensors in the first argument of the model call function: :obj:`model(inputs)`. 47 | 48 | If you choose this second option, there are three possibilities you can use to gather all the input Tensors 49 | in the first positional argument : 50 | 51 | - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` 52 | - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: 53 | :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` 54 | - a dictionary with one or several input Tensors associated to the input names given in the docstring: 55 | :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` 56 | 57 | Parameters: 58 | config (:class:`~transformers.CamembertConfig`): Model configuration class with all the parameters of the 59 | model. Initializing with a config file does not load the weights associated with the model, only the configuration. 60 | Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. 61 | """ 62 | 63 | 64 | @add_start_docstrings( 65 | "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.", 66 | CAMEMBERT_START_DOCSTRING, 67 | ) 68 | class TFCamembertModel(TFRobertaModel): 69 | """ 70 | This class overrides :class:`~transformers.TFRobertaModel`. Please check the 71 | superclass for the appropriate documentation alongside usage examples. 72 | """ 73 | 74 | config_class = CamembertConfig 75 | pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 76 | 77 | 78 | @add_start_docstrings( 79 | """CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING, 80 | ) 81 | class TFCamembertForMaskedLM(TFRobertaForMaskedLM): 82 | """ 83 | This class overrides :class:`~transformers.TFRobertaForMaskedLM`. Please check the 84 | superclass for the appropriate documentation alongside usage examples. 85 | """ 86 | 87 | config_class = CamembertConfig 88 | pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 89 | 90 | 91 | @add_start_docstrings( 92 | """CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer 93 | on top of the pooled output) e.g. for GLUE tasks. """, 94 | CAMEMBERT_START_DOCSTRING, 95 | ) 96 | class TFCamembertForSequenceClassification(TFRobertaForSequenceClassification): 97 | """ 98 | This class overrides :class:`~transformers.TFRobertaForSequenceClassification`. Please check the 99 | superclass for the appropriate documentation alongside usage examples. 
100 | """ 101 | 102 | config_class = CamembertConfig 103 | pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 104 | 105 | 106 | @add_start_docstrings( 107 | """CamemBERT Model with a token classification head on top (a linear layer on top of 108 | the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, 109 | CAMEMBERT_START_DOCSTRING, 110 | ) 111 | class TFCamembertForTokenClassification(TFRobertaForTokenClassification): 112 | """ 113 | This class overrides :class:`~transformers.TFRobertaForTokenClassification`. Please check the 114 | superclass for the appropriate documentation alongside usage examples. 115 | """ 116 | 117 | config_class = CamembertConfig 118 | pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 119 | -------------------------------------------------------------------------------- /src/transformers/modeling_tf_xlm_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ TF 2.0 XLM-RoBERTa model. """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_xlm_roberta import XLMRobertaConfig 22 | from .file_utils import add_start_docstrings 23 | from .modeling_tf_roberta import ( 24 | TFRobertaForMaskedLM, 25 | TFRobertaForSequenceClassification, 26 | TFRobertaForTokenClassification, 27 | TFRobertaModel, 28 | ) 29 | 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {} 34 | 35 | 36 | XLM_ROBERTA_START_DOCSTRING = r""" 37 | 38 | .. note:: 39 | 40 | TF 2.0 models accepts two formats as inputs: 41 | 42 | - having all inputs as keyword arguments (like PyTorch models), or 43 | - having all inputs as a list, tuple or dict in the first positional arguments. 44 | 45 | This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having 46 | all the tensors in the first argument of the model call function: :obj:`model(inputs)`. 47 | 48 | If you choose this second option, there are three possibilities you can use to gather all the input Tensors 49 | in the first positional argument : 50 | 51 | - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` 52 | - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: 53 | :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` 54 | - a dictionary with one or several input Tensors associated to the input names given in the docstring: 55 | :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` 56 | 57 | Parameters: 58 | config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the 59 | model. 
Initializing with a config file does not load the weights associated with the model, only the configuration. 60 | Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. 61 | """ 62 | 63 | 64 | @add_start_docstrings( 65 | "The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", 66 | XLM_ROBERTA_START_DOCSTRING, 67 | ) 68 | class TFXLMRobertaModel(TFRobertaModel): 69 | """ 70 | This class overrides :class:`~transformers.TFRobertaModel`. Please check the 71 | superclass for the appropriate documentation alongside usage examples. 72 | """ 73 | 74 | config_class = XLMRobertaConfig 75 | pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 76 | 77 | 78 | @add_start_docstrings( 79 | """XLM-RoBERTa Model with a `language modeling` head on top. """, XLM_ROBERTA_START_DOCSTRING, 80 | ) 81 | class TFXLMRobertaForMaskedLM(TFRobertaForMaskedLM): 82 | """ 83 | This class overrides :class:`~transformers.TFRobertaForMaskedLM`. Please check the 84 | superclass for the appropriate documentation alongside usage examples. 85 | """ 86 | 87 | config_class = XLMRobertaConfig 88 | pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 89 | 90 | 91 | @add_start_docstrings( 92 | """XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer 93 | on top of the pooled output) e.g. for GLUE tasks. """, 94 | XLM_ROBERTA_START_DOCSTRING, 95 | ) 96 | class TFXLMRobertaForSequenceClassification(TFRobertaForSequenceClassification): 97 | """ 98 | This class overrides :class:`~transformers.TFRobertaForSequenceClassification`. Please check the 99 | superclass for the appropriate documentation alongside usage examples. 100 | """ 101 | 102 | config_class = XLMRobertaConfig 103 | pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 104 | 105 | 106 | @add_start_docstrings( 107 | """XLM-RoBERTa Model with a token classification head on top (a linear layer on top of 108 | the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, 109 | XLM_ROBERTA_START_DOCSTRING, 110 | ) 111 | class TFXLMRobertaForTokenClassification(TFRobertaForTokenClassification): 112 | """ 113 | This class overrides :class:`~transformers.TFRobertaForTokenClassification`. Please check the 114 | superclass for the appropriate documentation alongside usage examples. 115 | """ 116 | 117 | config_class = XLMRobertaConfig 118 | pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 119 | -------------------------------------------------------------------------------- /src/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
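A hypothetical invocation of the Transformer-XL conversion script below (paths are placeholders; `--pytorch_dump_folder_path` is the only required argument):

```bash
python src/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py \
    --pytorch_dump_folder_path /path/to/output \
    --tf_checkpoint_path /path/to/transfo_xl/model.ckpt \
    --transfo_xl_config_file /path/to/transfo_xl/config.json
```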
15 | """Convert Transformer XL checkpoint and datasets.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | import os 21 | import pickle 22 | import sys 23 | 24 | import torch 25 | 26 | import transformers.tokenization_transfo_xl as data_utils 27 | from transformers import ( 28 | CONFIG_NAME, 29 | WEIGHTS_NAME, 30 | TransfoXLConfig, 31 | TransfoXLLMHeadModel, 32 | load_tf_weights_in_transfo_xl, 33 | ) 34 | from transformers.tokenization_transfo_xl import CORPUS_NAME, VOCAB_FILES_NAMES 35 | 36 | 37 | logging.basicConfig(level=logging.INFO) 38 | 39 | # We do this to be able to load python 2 datasets pickles 40 | # See e.g. https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918 41 | data_utils.Vocab = data_utils.TransfoXLTokenizer 42 | data_utils.Corpus = data_utils.TransfoXLCorpus 43 | sys.modules["data_utils"] = data_utils 44 | sys.modules["vocabulary"] = data_utils 45 | 46 | 47 | def convert_transfo_xl_checkpoint_to_pytorch( 48 | tf_checkpoint_path, transfo_xl_config_file, pytorch_dump_folder_path, transfo_xl_dataset_file 49 | ): 50 | if transfo_xl_dataset_file: 51 | # Convert a pre-processed corpus (see original TensorFlow repo) 52 | with open(transfo_xl_dataset_file, "rb") as fp: 53 | corpus = pickle.load(fp, encoding="latin1") 54 | # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term) 55 | pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["pretrained_vocab_file"] 56 | print("Save vocabulary to {}".format(pytorch_vocab_dump_path)) 57 | corpus_vocab_dict = corpus.vocab.__dict__ 58 | torch.save(corpus_vocab_dict, pytorch_vocab_dump_path) 59 | 60 | corpus_dict_no_vocab = corpus.__dict__ 61 | corpus_dict_no_vocab.pop("vocab", None) 62 | pytorch_dataset_dump_path = pytorch_dump_folder_path + "/" + CORPUS_NAME 63 | print("Save dataset to {}".format(pytorch_dataset_dump_path)) 64 | torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path) 65 | 66 | if tf_checkpoint_path: 67 | # Convert a pre-trained TensorFlow model 68 | config_path = os.path.abspath(transfo_xl_config_file) 69 | tf_path = os.path.abspath(tf_checkpoint_path) 70 | 71 | print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path)) 72 | # Initialise PyTorch model 73 | if transfo_xl_config_file == "": 74 | config = TransfoXLConfig() 75 | else: 76 | config = TransfoXLConfig.from_json_file(transfo_xl_config_file) 77 | print("Building PyTorch model from configuration: {}".format(str(config))) 78 | model = TransfoXLLMHeadModel(config) 79 | 80 | model = load_tf_weights_in_transfo_xl(model, config, tf_path) 81 | # Save pytorch-model 82 | pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) 83 | pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) 84 | print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) 85 | torch.save(model.state_dict(), pytorch_weights_dump_path) 86 | print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) 87 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 88 | f.write(config.to_json_string()) 89 | 90 | 91 | if __name__ == "__main__": 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument( 94 | "--pytorch_dump_folder_path", 95 | default=None, 96 | type=str, 97 | required=True, 98 | help="Path to the folder to store the PyTorch model or dataset/vocab.", 99 | ) 100 | parser.add_argument( 101 | 
"--tf_checkpoint_path", 102 | default="", 103 | type=str, 104 | help="An optional path to a TensorFlow checkpoint path to be converted.", 105 | ) 106 | parser.add_argument( 107 | "--transfo_xl_config_file", 108 | default="", 109 | type=str, 110 | help="An optional config json file corresponding to the pre-trained BERT model. \n" 111 | "This specifies the model architecture.", 112 | ) 113 | parser.add_argument( 114 | "--transfo_xl_dataset_file", 115 | default="", 116 | type=str, 117 | help="An optional dataset file to be converted in a vocabulary.", 118 | ) 119 | args = parser.parse_args() 120 | convert_transfo_xl_checkpoint_to_pytorch( 121 | args.tf_checkpoint_path, 122 | args.transfo_xl_config_file, 123 | args.pytorch_dump_folder_path, 124 | args.transfo_xl_dataset_file, 125 | ) 126 | -------------------------------------------------------------------------------- /src/transformers/data/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | try: 18 | from scipy.stats import pearsonr, spearmanr 19 | from sklearn.metrics import matthews_corrcoef, f1_score, average_precision_score, ndcg_score, roc_auc_score 20 | import numpy as np 21 | _has_sklearn = True 22 | except (AttributeError, ImportError): 23 | _has_sklearn = False 24 | 25 | 26 | def is_sklearn_available(): 27 | return _has_sklearn 28 | 29 | 30 | if _has_sklearn: 31 | 32 | def simple_accuracy(preds, labels): 33 | return (preds == labels).mean() 34 | 35 | def simple_ndcg(preds, labels, guids): 36 | ndcgs = [] 37 | query2content = {} 38 | for guid, pred, label in zip(guids, preds, labels): 39 | query = guid.split("_")[0] 40 | if not query in query2content: 41 | query2content[query] = [[int(pred)], [int(label)]] 42 | else: 43 | query2content[query][0].append(int(pred)) 44 | query2content[query][1].append(int(label)) 45 | 46 | for key in query2content.keys(): 47 | if len(query2content[key][1]) < 2 or len(query2content[key][0]) < 2: 48 | continue 49 | ndcgs.append(ndcg_score(np.asarray([query2content[key][1]]), np.asarray([query2content[key][0]]))) 50 | return {"ndcg" : np.array(ndcgs).mean()} 51 | 52 | def acc_and_f1(preds, labels): 53 | acc = simple_accuracy(preds, labels) 54 | f1 = f1_score(y_true=labels, y_pred=preds) 55 | return { 56 | "acc": acc, 57 | "f1": f1, 58 | "acc_and_f1": (acc + f1) / 2, 59 | } 60 | 61 | def acc_and_auc(preds, labels): # auc of pr curve is equal to average precision 62 | acc = simple_accuracy(preds, labels) 63 | auc = average_precision_score(labels, preds) 64 | return { 65 | "acc": acc, 66 | "auc": auc, 67 | "acc_and_auc": (acc + auc) / 2, 68 | } 69 | 70 | def acc_and_roc_auc(preds, labels): # auc of pr curve is equal to average precision 71 | acc = simple_accuracy(preds, labels) 72 | roc_auc = 
roc_auc_score(labels, preds) 73 | return { 74 | "acc": acc, 75 | "roc_auc": roc_auc, 76 | "acc_and_roc_auc": (acc + roc_auc) / 2, 77 | } 78 | 79 | def pearson_and_spearman(preds, labels): 80 | pearson_corr = pearsonr(preds, labels)[0] 81 | spearman_corr = spearmanr(preds, labels)[0] 82 | return { 83 | "pearson": pearson_corr, 84 | "spearmanr": spearman_corr, 85 | "corr": (pearson_corr + spearman_corr) / 2, 86 | } 87 | 88 | def xglue_compute_metrics(task_name, preds, labels, guids): 89 | assert len(preds) == len(labels) 90 | if task_name == "xnli": 91 | return {"acc": simple_accuracy(preds, labels)} 92 | elif task_name == "pawsx": 93 | return acc_and_auc(preds, labels) 94 | elif task_name == "qam": 95 | return acc_and_auc(preds, labels) 96 | elif task_name == "ads": 97 | return acc_and_roc_auc(preds, labels) 98 | elif task_name == "rel": 99 | return simple_ndcg(preds, labels, guids) 100 | elif task_name == "news": 101 | return {"acc": simple_accuracy(preds, labels)} 102 | else: 103 | raise KeyError(task_name) 104 | 105 | 106 | def glue_compute_metrics(task_name, preds, labels): 107 | assert len(preds) == len(labels) 108 | if task_name == "cola": 109 | return {"mcc": matthews_corrcoef(labels, preds)} 110 | elif task_name == "sst-2": 111 | return {"acc": simple_accuracy(preds, labels)} 112 | elif task_name == "mrpc": 113 | return acc_and_f1(preds, labels) 114 | elif task_name == "sts-b": 115 | return pearson_and_spearman(preds, labels) 116 | elif task_name == "qqp": 117 | return acc_and_f1(preds, labels) 118 | elif task_name == "mnli": 119 | return {"acc": simple_accuracy(preds, labels)} 120 | elif task_name == "mnli-mm": 121 | return {"acc": simple_accuracy(preds, labels)} 122 | elif task_name == "qnli": 123 | return {"acc": simple_accuracy(preds, labels)} 124 | elif task_name == "rte": 125 | return {"acc": simple_accuracy(preds, labels)} 126 | elif task_name == "wnli": 127 | return {"acc": simple_accuracy(preds, labels)} 128 | elif task_name == "hans": 129 | return {"acc": simple_accuracy(preds, labels)} 130 | else: 131 | raise KeyError(task_name) 132 | 133 | def xnli_compute_metrics(task_name, preds, labels): 134 | assert len(preds) == len(labels) 135 | if task_name == "xnli": 136 | return {"acc": simple_accuracy(preds, labels)} 137 | else: 138 | raise KeyError(task_name) 139 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple check list from AllenNLP repo: https://github.com/allenai/allennlp/blob/master/setup.py 3 | 4 | To create the package for pypi. 5 | 6 | 1. Change the version in __init__.py, setup.py as well as docs/source/conf.py. 7 | 8 | 2. Commit these changes with the message: "Release: VERSION" 9 | 10 | 3. Add a tag in git to mark the release: "git tag VERSION -m'Adds tag VERSION for pypi' " 11 | Push the tag to git: git push --tags origin master 12 | 13 | 4. Build both the sources and the wheel. Do not change anything in setup.py between 14 | creating the wheel and the source distribution (obviously). 15 | 16 | For the wheel, run: "python setup.py bdist_wheel" in the top level directory. 17 | (this will build a wheel for the python version you use to build it). 18 | 19 | For the sources, run: "python setup.py sdist" 20 | You should now have a /dist directory with both .whl and .tar.gz source versions. 21 | 22 | 5. 
Check that everything looks correct by uploading the package to the pypi test server: 23 | 24 | twine upload dist/* -r pypitest 25 | (pypi suggest using twine as other methods upload files via plaintext.) 26 | You may have to specify the repository url, use the following command then: 27 | twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/ 28 | 29 | Check that you can install it in a virtualenv by running: 30 | pip install -i https://testpypi.python.org/pypi transformers 31 | 32 | 6. Upload the final version to actual pypi: 33 | twine upload dist/* -r pypi 34 | 35 | 7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory. 36 | 37 | 8. Update the documentation commit in .circleci/deploy.sh for the accurate documentation to be displayed 38 | 39 | 9. Update README.md to redirect to correct documentation. 40 | """ 41 | 42 | import shutil 43 | from pathlib import Path 44 | 45 | from setuptools import find_packages, setup 46 | 47 | 48 | # Remove stale transformers.egg-info directory to avoid https://github.com/pypa/pip/issues/5466 49 | stale_egg_info = Path(__file__).parent / "transformers.egg-info" 50 | if stale_egg_info.exists(): 51 | print( 52 | ( 53 | "Warning: {} exists.\n\n" 54 | "If you recently updated transformers to 3.0 or later, this is expected,\n" 55 | "but it may prevent transformers from installing in editable mode.\n\n" 56 | "This directory is automatically generated by Python's packaging tools.\n" 57 | "I will remove it now.\n\n" 58 | "See https://github.com/pypa/pip/issues/5466 for details.\n" 59 | ).format(stale_egg_info) 60 | ) 61 | shutil.rmtree(stale_egg_info) 62 | 63 | 64 | extras = {} 65 | 66 | extras["mecab"] = ["mecab-python3"] 67 | extras["sklearn"] = ["scikit-learn==0.22.1"] 68 | extras["tf"] = ["tensorflow"] 69 | extras["tf-cpu"] = ["tensorflow-cpu"] 70 | extras["torch"] = ["torch"] 71 | 72 | extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"] 73 | extras["all"] = extras["serving"] + ["tensorflow", "torch"] 74 | 75 | extras["testing"] = ["pytest", "pytest-xdist"] 76 | extras["quality"] = ["black", "isort", "flake8"] 77 | extras["docs"] = ["recommonmark", "sphinx", "sphinx-markdown-tables", "sphinx-rtd-theme"] 78 | extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3", "scikit-learn", "tensorflow", "torch"] 79 | 80 | setup( 81 | name="transformers", 82 | version="2.5.1", 83 | author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors", 84 | author_email="thomas@huggingface.co", 85 | description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch", 86 | long_description=open("README.md", "r", encoding="utf-8").read(), 87 | long_description_content_type="text/markdown", 88 | keywords="NLP deep learning transformer pytorch tensorflow BERT GPT GPT-2 google openai CMU", 89 | license="Apache", 90 | url="https://github.com/huggingface/transformers", 91 | package_dir={"": "src"}, 92 | packages=find_packages("src"), 93 | install_requires=[ 94 | "numpy", 95 | "tokenizers == 0.5.2", 96 | # accessing files from S3 directly 97 | "boto3", 98 | # filesystem locks e.g. 
to prevent parallel downloads 99 | "filelock", 100 | # for downloading models over HTTPS 101 | "requests", 102 | # progress bars in model download and training scripts 103 | "tqdm >= 4.27", 104 | # for OpenAI GPT 105 | "regex != 2019.12.17", 106 | # for XLNet 107 | "sentencepiece == 0.1.92", 108 | # for XLM 109 | "sacremoses", 110 | # for ndcg 111 | "scikit-learn == 0.22", 112 | # for tensorboard 113 | "tensorboardX", 114 | # for ner 115 | "seqeval", 116 | # for torch 117 | "torch", 118 | ], 119 | extras_require=extras, 120 | scripts=["transformers-cli"], 121 | python_requires=">=3.5.0", 122 | classifiers=[ 123 | "Development Status :: 5 - Production/Stable", 124 | "Intended Audience :: Developers", 125 | "Intended Audience :: Education", 126 | "Intended Audience :: Science/Research", 127 | "License :: OSI Approved :: Apache Software License", 128 | "Operating System :: OS Independent", 129 | "Programming Language :: Python :: 3", 130 | "Programming Language :: Python :: 3.5", 131 | "Programming Language :: Python :: 3.6", 132 | "Programming Language :: Python :: 3.7", 133 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 134 | ], 135 | ) 136 | -------------------------------------------------------------------------------- /eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source init.sh 4 | 5 | MODEL_NAME_OR_PATH='xlm-roberta-large' 6 | 7 | usage() 8 | { 9 | cat << EOF 10 | usage: $0 options 11 | OPTIONS: 12 | -h Show the help and exit 13 | -n Experiment name to evaluate 14 | -m Pretrained model name or path 15 | -t task to evaluate 16 | -x For convinent usage 17 | EOF 18 | } 19 | 20 | while getopts "h:d:m:n:t:x:k:" opt 21 | do 22 | case $opt in 23 | h) 24 | usage 25 | exit 1 26 | ;; 27 | n) 28 | EXP_NAME=$OPTARG 29 | ;; 30 | m) 31 | MODEL_NAME_OR_PATH=$OPTARG 32 | ;; 33 | t) 34 | TASK=$OPTARG 35 | ;; 36 | x) 37 | OTHER_ARGS=$OPTARG 38 | ;; 39 | esac 40 | done 41 | 42 | DATA_DIR=$DATA_ROOT/data_raw 43 | if [[ ! -d $DATA_DIR ]]; then 44 | echo "$DATA_DIR not exist" 45 | exit 1 46 | fi 47 | 48 | OUTPUT_DIR=$DATA_ROOT/outputs/$EXP_NAME 49 | if [[ ! 
-d $OUTPUT_DIR ]]; then 50 | echo "$OUTPUT_DIR not exist, please specify it" 51 | exit 1 52 | fi 53 | 54 | xnli() { 55 | python ./examples/run_xcls.py \ 56 | --task_name xnli \ 57 | --model_type filter \ 58 | --data_dir $DATA_DIR/xnli \ 59 | --model_name_or_path $MODEL_NAME_OR_PATH \ 60 | --language ar,bg,de,el,en,es,fr,hi,ru,sw,th,tr,ur,vi,zh \ 61 | --train_language en \ 62 | --do_eval \ 63 | --eval_splits 'valid' \ 64 | --max_seq_length 256 \ 65 | --output_dir $OUTPUT_DIR \ 66 | --per_gpu_eval_batch_size 64 \ 67 | --filter_m 1 --filter_k 1 \ 68 | ${OTHER_ARGS} 69 | } 70 | 71 | pawsx() { 72 | python ./examples/run_xcls.py \ 73 | --task_name pawsx \ 74 | --data_dir $DATA_DIR/pawsx \ 75 | --model_type filter \ 76 | --model_name_or_path $MODEL_NAME_OR_PATH \ 77 | --language de,en,es,fr,ja,ko,zh \ 78 | --train_language en \ 79 | --do_eval \ 80 | --eval_splits valid \ 81 | --max_seq_length 256 \ 82 | --output_dir $OUTPUT_DIR \ 83 | --per_gpu_eval_batch_size 64 \ 84 | --filter_m 1 --filter_k 1 \ 85 | ${OTHER_ARGS} 86 | } 87 | 88 | # mlqa and xquad share the same training set 89 | mlqa() { 90 | python ./examples/run_xqa.py \ 91 | --task_name mlqa \ 92 | --data_dir $DATA_DIR \ 93 | --model_type filter \ 94 | --model_name_or_path $MODEL_NAME_OR_PATH \ 95 | --language en,es,de,ar,hi,vi,zh \ 96 | --train_language en \ 97 | --do_eval \ 98 | --eval_splits dev \ 99 | --do_lower_case \ 100 | --per_gpu_eval_batch_size 64 \ 101 | --max_seq_length 384 \ 102 | --doc_stride 128 \ 103 | --output_dir $OUTPUT_DIR \ 104 | --threads 8 \ 105 | --filter_m 1 --filter_k 20 \ 106 | ${OTHER_ARGS} 107 | } 108 | 109 | xquad() { 110 | python ./examples/run_xqa.py \ 111 | --task_name xquad \ 112 | --model_type filter \ 113 | --model_name_or_path $MODEL_NAME_OR_PATH \ 114 | --do_eval \ 115 | --eval_splits 'test' \ 116 | --do_lower_case \ 117 | --language ar,de,el,en,es,hi,ru,th,tr,vi,zh \ 118 | --train_language en \ 119 | --data_dir $DATA_DIR \ 120 | --per_gpu_eval_batch_size 64 \ 121 | --max_seq_length 384 \ 122 | --doc_stride 128 \ 123 | --output_dir $OUTPUT_DIR \ 124 | --threads 8 \ 125 | --filter_m 1 --filter_k 20 \ 126 | ${OTHER_ARGS} 127 | } 128 | 129 | tydiqa() { 130 | python ./examples/run_xqa.py \ 131 | --task_name tydiqa \ 132 | --model_type filter \ 133 | --model_name_or_path $MODEL_NAME_OR_PATH \ 134 | --do_eval \ 135 | --do_lower_case \ 136 | --language ar,bn,en,fi,id,ko,ru,sw,te \ 137 | --eval_splits dev \ 138 | --train_language en \ 139 | --data_dir $DATA_DIR \ 140 | --per_gpu_eval_batch_size 64 \ 141 | --max_seq_length 384 \ 142 | --doc_stride 128 \ 143 | --output_dir $OUTPUT_DIR \ 144 | --threads 8 \ 145 | --filter_m 1 --filter_k 20 \ 146 | ${OTHER_ARGS} 147 | } 148 | 149 | udpos() { 150 | python ./examples/run_xtag.py \ 151 | --task_name udpos \ 152 | --model_type filter \ 153 | --data_dir $DATA_DIR/udpos/udpos_processed_maxlen128 \ 154 | --labels $DATA_DIR/udpos/udpos_processed_maxlen128/labels.txt \ 155 | --model_name_or_path $MODEL_NAME_OR_PATH \ 156 | --output_dir $OUTPUT_DIR \ 157 | --train_language en \ 158 | --language 'af,ar,bg,de,el,en,es,et,eu,fa,fi,fr,he,hi,hu,id,it,ja,kk,ko,mr,nl,pt,ru,ta,te,th,tl,tr,ur,vi,yo,zh' \ 159 | --eval_splits dev \ 160 | --max_seq_length 128 \ 161 | --per_gpu_eval_batch_size 64 \ 162 | --do_eval \ 163 | --filter_m 1 --filter_k 1 \ 164 | ${OTHER_ARGS} 165 | } 166 | 167 | panx() { 168 | python ./examples/run_xtag.py \ 169 | --task_name panx \ 170 | --model_type filter \ 171 | --data_dir $DATA_DIR/panx/panx_processed_maxlen128 \ 172 | --labels 
$DATA_DIR/panx/panx_processed_maxlen128/labels.txt \ 173 | --model_name_or_path $MODEL_NAME_OR_PATH \ 174 | --output_dir $OUTPUT_DIR \ 175 | --language ar,he,vi,id,jv,ms,tl,eu,ml,ta,te,af,nl,en,de,el,bn,hi,mr,ur,fa,fr,it,pt,es,bg,ru,ja,ka,ko,th,sw,yo,my,zh,kk,tr,et,fi,hu \ 176 | --eval_splits dev \ 177 | --train_language en \ 178 | --max_seq_length 128 \ 179 | --per_gpu_eval_batch_size 64 \ 180 | --do_eval \ 181 | --filter_m 1 --filter_k 1 \ 182 | ${OTHER_ARGS} 183 | } 184 | 185 | for task in xnli pawsx mlqa xquad tydiqa panx udpos; do 186 | if [[ ${TASK:-"xnli"} == $task ]]; then 187 | $task 188 | fi 189 | done 190 | -------------------------------------------------------------------------------- /src/transformers/tokenization_flaubert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for Flaubert, based on XLM.""" 16 | 17 | 18 | import logging 19 | import unicodedata 20 | 21 | import six 22 | 23 | from .tokenization_xlm import XLMTokenizer 24 | 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | VOCAB_FILES_NAMES = { 29 | "vocab_file": "vocab.json", 30 | "merges_file": "merges.txt", 31 | } 32 | 33 | PRETRAINED_VOCAB_FILES_MAP = { 34 | "vocab_file": { 35 | "flaubert-small-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/vocab.json", 36 | "flaubert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/vocab.json", 37 | "flaubert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/vocab.json", 38 | "flaubert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/vocab.json", 39 | }, 40 | "merges_file": { 41 | "flaubert-small-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/merges.txt", 42 | "flaubert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/merges.txt", 43 | "flaubert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/merges.txt", 44 | "flaubert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/merges.txt", 45 | }, 46 | } 47 | 48 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 49 | "flaubert-small-cased": 512, 50 | "flaubert-base-uncased": 512, 51 | "flaubert-base-cased": 512, 52 | "flaubert-large-cased": 512, 53 | } 54 | 55 | PRETRAINED_INIT_CONFIGURATION = { 56 | "flaubert-small-cased": {"do_lowercase": False}, 57 | "flaubert-base-uncased": {"do_lowercase": True}, 58 | "flaubert-base-cased": {"do_lowercase": False}, 59 | "flaubert-large-cased": {"do_lowercase": False}, 60 | } 61 | 62 | 63 | def convert_to_unicode(text): 64 | """ 65 | Converts `text` to Unicode (if it's not already), 
assuming UTF-8 input. 66 | """ 67 | # six_ensure_text is copied from https://github.com/benjaminp/six 68 | def six_ensure_text(s, encoding="utf-8", errors="strict"): 69 | if isinstance(s, six.binary_type): 70 | return s.decode(encoding, errors) 71 | elif isinstance(s, six.text_type): 72 | return s 73 | else: 74 | raise TypeError("not expecting type '%s'" % type(s)) 75 | 76 | return six_ensure_text(text, encoding="utf-8", errors="ignore") 77 | 78 | 79 | class FlaubertTokenizer(XLMTokenizer): 80 | """ 81 | BPE tokenizer for Flaubert 82 | 83 | - Moses preprocessing & tokenization 84 | - Normalize all inputs text 85 | - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \ 86 | (ex: "__classify__") to a vocabulary 87 | - `do_lowercase` controle lower casing (automatically set for pretrained vocabularies) 88 | 89 | This tokenizer inherits from :class:`~transformers.XLMTokenizer`. Please check the superclass for usage examples 90 | and documentation regarding arguments. 91 | """ 92 | 93 | vocab_files_names = VOCAB_FILES_NAMES 94 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 95 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 96 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 97 | 98 | def __init__(self, do_lowercase=False, **kwargs): 99 | super().__init__(**kwargs) 100 | self.do_lowercase = do_lowercase 101 | self.do_lowercase_and_remove_accent = False 102 | 103 | def preprocess_text(self, text): 104 | text = text.replace("``", '"').replace("''", '"') 105 | text = convert_to_unicode(text) 106 | text = unicodedata.normalize("NFC", text) 107 | 108 | if self.do_lowercase: 109 | text = text.lower() 110 | 111 | return text 112 | 113 | def _tokenize(self, text, bypass_tokenizer=False): 114 | """ 115 | Tokenize a string given language code using Moses. 116 | 117 | Details of tokenization: 118 | - [sacremoses](https://github.com/alvations/sacremoses): port of Moses 119 | - Install with `pip install sacremoses` 120 | 121 | Args: 122 | - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) (bool). If True, we only apply BPE. 123 | 124 | Returns: 125 | List of tokens. 126 | """ 127 | lang = "fr" 128 | if lang and self.lang2id and lang not in self.lang2id: 129 | logger.error( 130 | "Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model." 131 | ) 132 | 133 | if bypass_tokenizer: 134 | text = text.split() 135 | else: 136 | text = self.preprocess_text(text) 137 | text = self.moses_pipeline(text, lang=lang) 138 | text = self.moses_tokenize(text, lang=lang) 139 | 140 | split_tokens = [] 141 | for token in text: 142 | if token: 143 | split_tokens.extend([t for t in self.bpe(token).split(" ")]) 144 | 145 | return split_tokens 146 | -------------------------------------------------------------------------------- /src/transformers/modeling_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 Inria, Facebook AI Research and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
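# ---------------------------------------------------------------------------
# A minimal usage sketch for the FlaubertTokenizer defined in
# tokenization_flaubert.py above: `preprocess_text` rewrites ``...'' quotes,
# applies NFC normalization and optional lower-casing, and `_tokenize` then
# runs the Moses pipeline followed by BPE. The sketch assumes the
# "flaubert-base-cased" shortcut from PRETRAINED_VOCAB_FILES_MAP is reachable
# and that sacremoses is installed.
from transformers import FlaubertTokenizer

flaubert_tokenizer = FlaubertTokenizer.from_pretrained("flaubert-base-cased")

# ``...'' style quotes are normalized to plain double quotes before BPE.
tokens = flaubert_tokenizer.tokenize("Le chat ``dort'' sur le canapé.")

# encode() additionally wraps the ids with the model's special tokens.
input_ids = flaubert_tokenizer.encode("Le chat dort sur le canapé.", add_special_tokens=True)
# ---------------------------------------------------------------------------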
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """PyTorch CamemBERT model. """ 17 | 18 | import logging 19 | 20 | from .configuration_camembert import CamembertConfig 21 | from .file_utils import add_start_docstrings 22 | from .modeling_roberta import ( 23 | RobertaForMaskedLM, 24 | RobertaForMultipleChoice, 25 | RobertaForQuestionAnswering, 26 | RobertaForSequenceClassification, 27 | RobertaForTokenClassification, 28 | RobertaModel, 29 | ) 30 | 31 | 32 | logger = logging.getLogger(__name__) 33 | 34 | CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 35 | "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-pytorch_model.bin", 36 | "umberto-commoncrawl-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-commoncrawl-cased-v1/pytorch_model.bin", 37 | "umberto-wikipedia-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-wikipedia-uncased-v1/pytorch_model.bin", 38 | } 39 | 40 | CAMEMBERT_START_DOCSTRING = r""" 41 | 42 | This model is a PyTorch `torch.nn.Module `_ sub-class. 43 | Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general 44 | usage and behavior. 45 | 46 | Parameters: 47 | config (:class:`~transformers.CamembertConfig`): Model configuration class with all the parameters of the 48 | model. Initializing with a config file does not load the weights associated with the model, only the 49 | configuration. 50 | Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. 51 | """ 52 | 53 | 54 | @add_start_docstrings( 55 | "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.", 56 | CAMEMBERT_START_DOCSTRING, 57 | ) 58 | class CamembertModel(RobertaModel): 59 | """ 60 | This class overrides :class:`~transformers.RobertaModel`. Please check the 61 | superclass for the appropriate documentation alongside usage examples. 62 | """ 63 | 64 | config_class = CamembertConfig 65 | pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 66 | 67 | 68 | @add_start_docstrings( 69 | """CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING, 70 | ) 71 | class CamembertForMaskedLM(RobertaForMaskedLM): 72 | """ 73 | This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the 74 | superclass for the appropriate documentation alongside usage examples. 75 | """ 76 | 77 | config_class = CamembertConfig 78 | pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 79 | 80 | 81 | @add_start_docstrings( 82 | """CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer 83 | on top of the pooled output) e.g. for GLUE tasks. """, 84 | CAMEMBERT_START_DOCSTRING, 85 | ) 86 | class CamembertForSequenceClassification(RobertaForSequenceClassification): 87 | """ 88 | This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the 89 | superclass for the appropriate documentation alongside usage examples. 
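    Example (an illustrative sketch, assuming the ``camembert-base`` checkpoint
    from the archive map above; the classification head is freshly initialized,
    so the logits are only meaningful after fine-tuning)::

        import torch
        from transformers import CamembertTokenizer, CamembertForSequenceClassification

        tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
        model = CamembertForSequenceClassification.from_pretrained("camembert-base")

        input_ids = torch.tensor([tokenizer.encode("J'aime le camembert !", add_special_tokens=True)])
        outputs = model(input_ids)
        logits = outputs[0]  # shape (batch_size, num_labels)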
90 | """ 91 | 92 | config_class = CamembertConfig 93 | pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 94 | 95 | 96 | @add_start_docstrings( 97 | """CamemBERT Model with a multiple choice classification head on top (a linear layer on top of 98 | the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, 99 | CAMEMBERT_START_DOCSTRING, 100 | ) 101 | class CamembertForMultipleChoice(RobertaForMultipleChoice): 102 | """ 103 | This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the 104 | superclass for the appropriate documentation alongside usage examples. 105 | """ 106 | 107 | config_class = CamembertConfig 108 | pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 109 | 110 | 111 | @add_start_docstrings( 112 | """CamemBERT Model with a token classification head on top (a linear layer on top of 113 | the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, 114 | CAMEMBERT_START_DOCSTRING, 115 | ) 116 | class CamembertForTokenClassification(RobertaForTokenClassification): 117 | """ 118 | This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the 119 | superclass for the appropriate documentation alongside usage examples. 120 | """ 121 | 122 | config_class = CamembertConfig 123 | pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 124 | 125 | 126 | @add_start_docstrings( 127 | """CamemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD 128 | (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits` """, 129 | CAMEMBERT_START_DOCSTRING, 130 | ) 131 | class CamembertForQuestionAnswering(RobertaForQuestionAnswering): 132 | """ 133 | This class overrides :class:`~transformers.RobertaForQuestionAnswering`. Please check the 134 | superclass for the appropriate documentation alongside usage examples. 135 | """ 136 | 137 | config_class = CamembertConfig 138 | pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 139 | -------------------------------------------------------------------------------- /src/transformers/configuration_ctrl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Salesforce and HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Salesforce CTRL configuration """ 16 | 17 | 18 | import logging 19 | 20 | from .configuration_utils import PretrainedConfig 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf-ctrl/pytorch/ctrl-config.json"} 26 | 27 | 28 | class CTRLConfig(PretrainedConfig): 29 | """ 30 | This is the configuration class to store the configuration of an :class:`~transformers.CTRLModel`. 
31 | It is used to instantiate an CTRL model according to the specified arguments, defining the model 32 | architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of 33 | the `ctrl `__ architecture from SalesForce. 34 | 35 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used 36 | to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` 37 | for more information. 38 | 39 | Args: 40 | vocab_size (:obj:`int`, optional, defaults to 246534): 41 | Vocabulary size of the CTRL model. Defines the different tokens that 42 | can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`. 43 | n_positions (:obj:`int`, optional, defaults to 256): 44 | The maximum sequence length that this model might ever be used with. 45 | Typically set this to something large just in case (e.g., 512 or 1024 or 2048). 46 | n_ctx (:obj:`int`, optional, defaults to 256): 47 | Dimensionality of the causal mask (usually same as n_positions). 48 | n_embd (:obj:`int`, optional, defaults to 1280): 49 | Dimensionality of the embeddings and hidden states. 50 | dff (:obj:`int`, optional, defaults to 8192): 51 | Dimensionality of the inner dimension of the FFN. 52 | n_layer (:obj:`int`, optional, defaults to 48): 53 | Number of hidden layers in the Transformer encoder. 54 | n_head (:obj:`int`, optional, defaults to 16): 55 | Number of attention heads for each attention layer in the Transformer encoder. 56 | resid_pdrop (:obj:`float`, optional, defaults to 0.1): 57 | The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. 58 | embd_pdrop (:obj:`int`, optional, defaults to 0.1): 59 | The dropout ratio for the embeddings. 60 | attn_pdrop (:obj:`float`, optional, defaults to 0.1): 61 | The dropout ratio for the attention. 62 | layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6): 63 | The epsilon to use in the layer normalization layers 64 | initializer_range (:obj:`float`, optional, defaults to 0.02): 65 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 66 | 67 | Example:: 68 | 69 | from transformers import CTRLModel, CTRLConfig 70 | 71 | # Initializing a CTRL configuration 72 | configuration = CTRLConfig() 73 | 74 | # Initializing a model from the configuration 75 | model = CTRLModel(configuration) 76 | 77 | # Accessing the model configuration 78 | configuration = model.config 79 | 80 | Attributes: 81 | pretrained_config_archive_map (Dict[str, str]): 82 | A dictionary containing all the available pre-trained checkpoints. 
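    The size-related arguments can also be overridden to build a much smaller
    model, which is convenient for unit tests (an illustrative sketch)::

        from transformers import CTRLConfig, CTRLModel

        tiny_config = CTRLConfig(n_layer=2, n_head=4, n_embd=128, dff=512)
        tiny_model = CTRLModel(tiny_config)

        assert tiny_model.config.num_hidden_layers == 2  # property alias for ``n_layer``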
83 | """ 84 | 85 | pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP 86 | model_type = "ctrl" 87 | 88 | def __init__( 89 | self, 90 | vocab_size=246534, 91 | n_positions=256, 92 | n_ctx=256, 93 | n_embd=1280, 94 | dff=8192, 95 | n_layer=48, 96 | n_head=16, 97 | resid_pdrop=0.1, 98 | embd_pdrop=0.1, 99 | attn_pdrop=0.1, 100 | layer_norm_epsilon=1e-6, 101 | initializer_range=0.02, 102 | summary_type="cls_index", 103 | summary_use_proj=True, 104 | summary_activation=None, 105 | summary_proj_to_labels=True, 106 | summary_first_dropout=0.1, 107 | **kwargs 108 | ): 109 | super().__init__(**kwargs) 110 | self.vocab_size = vocab_size 111 | self.n_ctx = n_ctx 112 | self.n_positions = n_positions 113 | self.n_embd = n_embd 114 | self.n_layer = n_layer 115 | self.n_head = n_head 116 | self.dff = dff 117 | self.resid_pdrop = resid_pdrop 118 | self.embd_pdrop = embd_pdrop 119 | self.attn_pdrop = attn_pdrop 120 | self.layer_norm_epsilon = layer_norm_epsilon 121 | self.initializer_range = initializer_range 122 | 123 | self.summary_type = summary_type 124 | self.summary_use_proj = summary_use_proj 125 | self.summary_activation = summary_activation 126 | self.summary_first_dropout = summary_first_dropout 127 | self.summary_proj_to_labels = summary_proj_to_labels 128 | 129 | @property 130 | def max_position_embeddings(self): 131 | return self.n_positions 132 | 133 | @property 134 | def hidden_size(self): 135 | return self.n_embd 136 | 137 | @property 138 | def num_attention_heads(self): 139 | return self.n_head 140 | 141 | @property 142 | def num_hidden_layers(self): 143 | return self.n_layer 144 | -------------------------------------------------------------------------------- /src/transformers/commands/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | from argparse import ArgumentParser, Namespace 3 | from logging import getLogger 4 | 5 | from transformers import SingleSentenceClassificationProcessor as Processor 6 | from transformers import TextClassificationPipeline, is_tf_available, is_torch_available 7 | from transformers.commands import BaseTransformersCLICommand 8 | 9 | 10 | if not is_tf_available() and not is_torch_available(): 11 | raise RuntimeError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training") 12 | 13 | # TF training parameters 14 | USE_XLA = False 15 | USE_AMP = False 16 | 17 | 18 | def train_command_factory(args: Namespace): 19 | """ 20 | Factory function used to instantiate serving server from provided command line arguments. 21 | :return: ServeCommand 22 | """ 23 | return TrainCommand(args) 24 | 25 | 26 | class TrainCommand(BaseTransformersCLICommand): 27 | @staticmethod 28 | def register_subcommand(parser: ArgumentParser): 29 | """ 30 | Register this command to argparse so it's available for the transformer-cli 31 | :param parser: Root parser to register command-specific arguments 32 | :return: 33 | """ 34 | train_parser = parser.add_parser("train", help="CLI tool to train a model on a task.") 35 | 36 | train_parser.add_argument( 37 | "--train_data", 38 | type=str, 39 | required=True, 40 | help="path to train (and optionally evaluation) dataset as a csv with " 41 | "tab separated labels and sentences.", 42 | ) 43 | train_parser.add_argument( 44 | "--column_label", type=int, default=0, help="Column of the dataset csv file with example labels." 
45 | ) 46 | train_parser.add_argument( 47 | "--column_text", type=int, default=1, help="Column of the dataset csv file with example texts." 48 | ) 49 | train_parser.add_argument( 50 | "--column_id", type=int, default=2, help="Column of the dataset csv file with example ids." 51 | ) 52 | train_parser.add_argument( 53 | "--skip_first_row", action="store_true", help="Skip the first row of the csv file (headers)." 54 | ) 55 | 56 | train_parser.add_argument("--validation_data", type=str, default="", help="path to validation dataset.") 57 | train_parser.add_argument( 58 | "--validation_split", 59 | type=float, 60 | default=0.1, 61 | help="if validation dataset is not provided, fraction of train dataset " "to use as validation dataset.", 62 | ) 63 | 64 | train_parser.add_argument("--output", type=str, default="./", help="path to saved the trained model.") 65 | 66 | train_parser.add_argument( 67 | "--task", type=str, default="text_classification", help="Task to train the model on." 68 | ) 69 | train_parser.add_argument( 70 | "--model", type=str, default="bert-base-uncased", help="Model's name or path to stored model." 71 | ) 72 | train_parser.add_argument("--train_batch_size", type=int, default=32, help="Batch size for training.") 73 | train_parser.add_argument("--valid_batch_size", type=int, default=64, help="Batch size for validation.") 74 | train_parser.add_argument("--learning_rate", type=float, default=3e-5, help="Learning rate.") 75 | train_parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon for Adam optimizer.") 76 | train_parser.set_defaults(func=train_command_factory) 77 | 78 | def __init__(self, args: Namespace): 79 | self.logger = getLogger("transformers-cli/training") 80 | 81 | self.framework = "tf" if is_tf_available() else "torch" 82 | 83 | os.makedirs(args.output, exist_ok=True) 84 | assert os.path.isdir(args.output) 85 | self.output = args.output 86 | 87 | self.column_label = args.column_label 88 | self.column_text = args.column_text 89 | self.column_id = args.column_id 90 | 91 | self.logger.info("Loading {} pipeline for {}".format(args.task, args.model)) 92 | if args.task == "text_classification": 93 | self.pipeline = TextClassificationPipeline.from_pretrained(args.model) 94 | elif args.task == "token_classification": 95 | raise NotImplementedError 96 | elif args.task == "question_answering": 97 | raise NotImplementedError 98 | 99 | self.logger.info("Loading dataset from {}".format(args.train_data)) 100 | self.train_dataset = Processor.create_from_csv( 101 | args.train_data, 102 | column_label=args.column_label, 103 | column_text=args.column_text, 104 | column_id=args.column_id, 105 | skip_first_row=args.skip_first_row, 106 | ) 107 | self.valid_dataset = None 108 | if args.validation_data: 109 | self.logger.info("Loading validation dataset from {}".format(args.validation_data)) 110 | self.valid_dataset = Processor.create_from_csv( 111 | args.validation_data, 112 | column_label=args.column_label, 113 | column_text=args.column_text, 114 | column_id=args.column_id, 115 | skip_first_row=args.skip_first_row, 116 | ) 117 | 118 | self.validation_split = args.validation_split 119 | self.train_batch_size = args.train_batch_size 120 | self.valid_batch_size = args.valid_batch_size 121 | self.learning_rate = args.learning_rate 122 | self.adam_epsilon = args.adam_epsilon 123 | 124 | def run(self): 125 | if self.framework == "tf": 126 | return self.run_tf() 127 | return self.run_torch() 128 | 129 | def run_torch(self): 130 | raise NotImplementedError 131 | 132 | def 
run_tf(self): 133 | self.pipeline.fit( 134 | self.train_dataset, 135 | validation_data=self.valid_dataset, 136 | validation_split=self.validation_split, 137 | learning_rate=self.learning_rate, 138 | adam_epsilon=self.adam_epsilon, 139 | train_batch_size=self.train_batch_size, 140 | valid_batch_size=self.valid_batch_size, 141 | ) 142 | 143 | # Save trained pipeline 144 | self.pipeline.save_pretrained(self.output) 145 | -------------------------------------------------------------------------------- /src/transformers/data/metrics/mlqa_evaluation_v1.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | """ Official evaluation script for the MLQA dataset. """ 8 | from __future__ import print_function 9 | from collections import Counter 10 | import string 11 | import re 12 | import argparse 13 | import json 14 | import sys 15 | import unicodedata 16 | 17 | 18 | PUNCT = {chr(i) for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P')}.union(string.punctuation) 19 | WHITESPACE_LANGS = ['en', 'es', 'hi', 'vi', 'de', 'ar'] 20 | MIXED_SEGMENTATION_LANGS = ['zh'] 21 | 22 | 23 | def whitespace_tokenize(text): 24 | return text.split() 25 | 26 | 27 | def mixed_segmentation(text): 28 | segs_out = [] 29 | temp_str = "" 30 | for char in text: 31 | if re.search(r'[\u4e00-\u9fa5]', char) or char in PUNCT: 32 | if temp_str != "": 33 | ss = whitespace_tokenize(temp_str) 34 | segs_out.extend(ss) 35 | temp_str = "" 36 | segs_out.append(char) 37 | else: 38 | temp_str += char 39 | 40 | if temp_str != "": 41 | ss = whitespace_tokenize(temp_str) 42 | segs_out.extend(ss) 43 | 44 | return segs_out 45 | 46 | 47 | def normalize_answer(s, lang): 48 | """Lower text and remove punctuation, articles and extra whitespace.""" 49 | 50 | def remove_articles(text, lang): 51 | if lang == 'en': 52 | return re.sub(r'\b(a|an|the)\b', ' ', text) 53 | elif lang == 'es': 54 | return re.sub(r'\b(un|una|unos|unas|el|la|los|las)\b', ' ', text) 55 | elif lang == 'hi': 56 | return text # Hindi does not have formal articles 57 | elif lang == 'vi': 58 | return re.sub(r'\b(của|là|cái|chiếc|những)\b', ' ', text) 59 | elif lang == 'de': 60 | return re.sub(r'\b(ein|eine|einen|einem|eines|einer|der|die|das|den|dem|des)\b', ' ', text) 61 | elif lang == 'ar': 62 | return re.sub('\sال^|ال', ' ', text) 63 | elif lang == 'zh': 64 | return text # Chinese does not have formal articles 65 | else: 66 | raise Exception('Unknown Language {}'.format(lang)) 67 | 68 | def white_space_fix(text, lang): 69 | if lang in WHITESPACE_LANGS: 70 | tokens = whitespace_tokenize(text) 71 | elif lang in MIXED_SEGMENTATION_LANGS: 72 | tokens = mixed_segmentation(text) 73 | else: 74 | raise Exception('Unknown Language {}'.format(lang)) 75 | return ' '.join([t for t in tokens if t.strip() != '']) 76 | 77 | def remove_punc(text): 78 | return ''.join(ch for ch in text if ch not in PUNCT) 79 | 80 | def lower(text): 81 | return text.lower() 82 | 83 | return white_space_fix(remove_articles(remove_punc(lower(s)), lang), lang) 84 | 85 | 86 | def f1_score(prediction, ground_truth, lang): 87 | prediction_tokens = normalize_answer(prediction, lang).split() 88 | ground_truth_tokens = normalize_answer(ground_truth, lang).split() 89 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 90 | num_same = 
sum(common.values()) 91 | if num_same == 0: 92 | return 0 93 | precision = 1.0 * num_same / len(prediction_tokens) 94 | recall = 1.0 * num_same / len(ground_truth_tokens) 95 | f1 = (2 * precision * recall) / (precision + recall) 96 | return f1 97 | 98 | 99 | def exact_match_score(prediction, ground_truth, lang): 100 | return (normalize_answer(prediction, lang) == normalize_answer(ground_truth, lang)) 101 | 102 | 103 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths, lang): 104 | scores_for_ground_truths = [] 105 | for ground_truth in ground_truths: 106 | score = metric_fn(prediction, ground_truth, lang) 107 | scores_for_ground_truths.append(score) 108 | return max(scores_for_ground_truths) 109 | 110 | 111 | def evaluate(dataset, predictions, lang): 112 | f1 = exact_match = total = 0 113 | for article in dataset: 114 | for paragraph in article['paragraphs']: 115 | for qa in paragraph['qas']: 116 | total += 1 117 | if qa['id'] not in predictions: 118 | message = 'Unanswered question ' + qa['id'] + \ 119 | ' will receive score 0.' 120 | print(message, file=sys.stderr) 121 | continue 122 | ground_truths = list(map(lambda x: x['text'], qa['answers'])) 123 | prediction = predictions[qa['id']] 124 | exact_match += metric_max_over_ground_truths( 125 | exact_match_score, prediction, ground_truths, lang) 126 | f1 += metric_max_over_ground_truths( 127 | f1_score, prediction, ground_truths, lang) 128 | 129 | exact_match = 100.0 * exact_match / total 130 | f1 = 100.0 * f1 / total 131 | 132 | return {'exact_match': exact_match, 'f1': f1} 133 | 134 | 135 | def evaluate_with_path(dataset_file, prediction_file, answer_language): 136 | with open(dataset_file) as dataset_file_reader: 137 | dataset_json = json.load(dataset_file_reader) 138 | dataset = dataset_json['data'] 139 | with open(prediction_file) as prediction_file_reader: 140 | predictions = json.load(prediction_file_reader) 141 | return evaluate(dataset, predictions, answer_language) 142 | 143 | if __name__ == '__main__': 144 | expected_version = '1.0' 145 | parser = argparse.ArgumentParser( 146 | description='Evaluation for MLQA ' + expected_version) 147 | parser.add_argument('dataset_file', help='Dataset file') 148 | parser.add_argument('prediction_file', help='Prediction File') 149 | parser.add_argument('answer_language', help='Language code of answer language') 150 | 151 | args = parser.parse_args() 152 | with open(args.dataset_file) as dataset_file: 153 | dataset_json = json.load(dataset_file) 154 | if (str(dataset_json['version']) != expected_version): 155 | print('Evaluation expects v-' + expected_version + 156 | ', but got dataset with v-' + dataset_json['version'], 157 | file=sys.stderr) 158 | dataset = dataset_json['data'] 159 | with open(args.prediction_file) as prediction_file: 160 | predictions = json.load(prediction_file) 161 | print(json.dumps(evaluate(dataset, predictions, args.answer_language))) 162 | -------------------------------------------------------------------------------- /src/transformers/hf_api.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
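# ---------------------------------------------------------------------------
# A short sketch of driving the MLQA helpers from mlqa_evaluation_v1.py above
# programmatically rather than through the __main__ entry point; the two file
# names are placeholders for a real MLQA dev split and its prediction file.
from transformers.data.metrics.mlqa_evaluation_v1 import evaluate_with_path, f1_score

# Token-level F1 for a single Spanish prediction/reference pair; the
# language-aware normalization strips the articles "el"/"un" before comparing.
assert f1_score("el gato negro", "un gato negro", "es") == 1.0

# Whole-dataset evaluation returns the same dict as the __main__ block prints.
metrics = evaluate_with_path("mlqa_dev_de.json", "predictions_de.json", "de")
print(metrics)  # {'exact_match': ..., 'f1': ...}
# ---------------------------------------------------------------------------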
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import io 18 | import os 19 | from os.path import expanduser 20 | from typing import List 21 | 22 | import requests 23 | from tqdm import tqdm 24 | 25 | 26 | ENDPOINT = "https://huggingface.co" 27 | 28 | 29 | class S3Obj: 30 | def __init__(self, filename: str, LastModified: str, ETag: str, Size: int, **kwargs): 31 | self.filename = filename 32 | self.LastModified = LastModified 33 | self.ETag = ETag 34 | self.Size = Size 35 | 36 | 37 | class PresignedUrl: 38 | def __init__(self, write: str, access: str, type: str, **kwargs): 39 | self.write = write 40 | self.access = access 41 | self.type = type # mime-type to send to S3. 42 | 43 | 44 | class HfApi: 45 | def __init__(self, endpoint=None): 46 | self.endpoint = endpoint if endpoint is not None else ENDPOINT 47 | 48 | def login(self, username: str, password: str) -> str: 49 | """ 50 | Call HF API to sign in a user and get a token if credentials are valid. 51 | 52 | Outputs: 53 | token if credentials are valid 54 | 55 | Throws: 56 | requests.exceptions.HTTPError if credentials are invalid 57 | """ 58 | path = "{}/api/login".format(self.endpoint) 59 | r = requests.post(path, json={"username": username, "password": password}) 60 | r.raise_for_status() 61 | d = r.json() 62 | return d["token"] 63 | 64 | def whoami(self, token: str) -> str: 65 | """ 66 | Call HF API to know "whoami" 67 | """ 68 | path = "{}/api/whoami".format(self.endpoint) 69 | r = requests.get(path, headers={"authorization": "Bearer {}".format(token)}) 70 | r.raise_for_status() 71 | d = r.json() 72 | return d["user"] 73 | 74 | def logout(self, token: str) -> None: 75 | """ 76 | Call HF API to log out. 77 | """ 78 | path = "{}/api/logout".format(self.endpoint) 79 | r = requests.post(path, headers={"authorization": "Bearer {}".format(token)}) 80 | r.raise_for_status() 81 | 82 | def presign(self, token: str, filename: str) -> PresignedUrl: 83 | """ 84 | Call HF API to get a presigned url to upload `filename` to S3. 85 | """ 86 | path = "{}/api/presign".format(self.endpoint) 87 | r = requests.post(path, headers={"authorization": "Bearer {}".format(token)}, json={"filename": filename}) 88 | r.raise_for_status() 89 | d = r.json() 90 | return PresignedUrl(**d) 91 | 92 | def presign_and_upload(self, token: str, filename: str, filepath: str) -> str: 93 | """ 94 | Get a presigned url, then upload file to S3. 95 | 96 | Outputs: 97 | url: Read-only url for the stored file on S3. 98 | """ 99 | urls = self.presign(token, filename=filename) 100 | # streaming upload: 101 | # https://2.python-requests.org/en/master/user/advanced/#streaming-uploads 102 | # 103 | # Even though we presign with the correct content-type, 104 | # the client still has to specify it when uploading the file. 
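# Note on the block below: wrapping the handle in TqdmProgressFileReader keeps
# the PUT streaming while a progress bar tracks how many bytes `requests` has
# consumed; for zero-byte files the body is sent as "" rather than the wrapped
# handle, presumably to avoid edge cases when streaming an empty file object.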
105 | with open(filepath, "rb") as f: 106 | pf = TqdmProgressFileReader(f) 107 | data = f if pf.total_size > 0 else "" 108 | 109 | r = requests.put(urls.write, data=data, headers={"content-type": urls.type}) 110 | r.raise_for_status() 111 | pf.close() 112 | return urls.access 113 | 114 | def list_objs(self, token: str) -> List[S3Obj]: 115 | """ 116 | Call HF API to list all stored files for user. 117 | """ 118 | path = "{}/api/listObjs".format(self.endpoint) 119 | r = requests.get(path, headers={"authorization": "Bearer {}".format(token)}) 120 | r.raise_for_status() 121 | d = r.json() 122 | return [S3Obj(**x) for x in d] 123 | 124 | def delete_obj(self, token: str, filename: str): 125 | """ 126 | Call HF API to delete a file stored by user 127 | """ 128 | path = "{}/api/deleteObj".format(self.endpoint) 129 | r = requests.delete(path, headers={"authorization": "Bearer {}".format(token)}, json={"filename": filename}) 130 | r.raise_for_status() 131 | 132 | 133 | class TqdmProgressFileReader: 134 | """ 135 | Wrap an io.BufferedReader `f` (such as the output of `open(…, "rb")`) 136 | and override `f.read()` so as to display a tqdm progress bar. 137 | 138 | see github.com/huggingface/transformers/pull/2078#discussion_r354739608 139 | for implementation details. 140 | """ 141 | 142 | def __init__(self, f: io.BufferedReader): 143 | self.f = f 144 | self.total_size = os.fstat(f.fileno()).st_size 145 | self.pbar = tqdm(total=self.total_size, leave=False) 146 | self.read = f.read 147 | f.read = self._read 148 | 149 | def _read(self, n=-1): 150 | self.pbar.update(n) 151 | return self.read(n) 152 | 153 | def close(self): 154 | self.pbar.close() 155 | 156 | 157 | class HfFolder: 158 | path_token = expanduser("~/.huggingface/token") 159 | 160 | @classmethod 161 | def save_token(cls, token): 162 | """ 163 | Save token, creating folder as needed. 164 | """ 165 | os.makedirs(os.path.dirname(cls.path_token), exist_ok=True) 166 | with open(cls.path_token, "w+") as f: 167 | f.write(token) 168 | 169 | @classmethod 170 | def get_token(cls): 171 | """ 172 | Get token or None if not existent. 173 | """ 174 | try: 175 | with open(cls.path_token, "r") as f: 176 | return f.read() 177 | except FileNotFoundError: 178 | pass 179 | 180 | @classmethod 181 | def delete_token(cls): 182 | """ 183 | Delete token. 184 | Do not fail if token does not exist. 185 | """ 186 | try: 187 | os.remove(cls.path_token) 188 | except FileNotFoundError: 189 | pass 190 | -------------------------------------------------------------------------------- /src/transformers/commands/convert.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser, Namespace 2 | from logging import getLogger 3 | 4 | from transformers.commands import BaseTransformersCLICommand 5 | 6 | 7 | def convert_command_factory(args: Namespace): 8 | """ 9 | Factory function used to convert a model TF 1.0 checkpoint in a PyTorch checkpoint. 
10 | :return: ServeCommand 11 | """ 12 | return ConvertCommand( 13 | args.model_type, args.tf_checkpoint, args.pytorch_dump_output, args.config, args.finetuning_task_name 14 | ) 15 | 16 | 17 | class ConvertCommand(BaseTransformersCLICommand): 18 | @staticmethod 19 | def register_subcommand(parser: ArgumentParser): 20 | """ 21 | Register this command to argparse so it's available for the transformer-cli 22 | :param parser: Root parser to register command-specific arguments 23 | :return: 24 | """ 25 | train_parser = parser.add_parser( 26 | "convert", 27 | help="CLI tool to run convert model from original " 28 | "author checkpoints to Transformers PyTorch checkpoints.", 29 | ) 30 | train_parser.add_argument("--model_type", type=str, required=True, help="Model's type.") 31 | train_parser.add_argument( 32 | "--tf_checkpoint", type=str, required=True, help="TensorFlow checkpoint path or folder." 33 | ) 34 | train_parser.add_argument( 35 | "--pytorch_dump_output", type=str, required=True, help="Path to the PyTorch savd model output." 36 | ) 37 | train_parser.add_argument("--config", type=str, default="", help="Configuration file path or folder.") 38 | train_parser.add_argument( 39 | "--finetuning_task_name", 40 | type=str, 41 | default=None, 42 | help="Optional fine-tuning task name if the TF model was a finetuned model.", 43 | ) 44 | train_parser.set_defaults(func=convert_command_factory) 45 | 46 | def __init__( 47 | self, 48 | model_type: str, 49 | tf_checkpoint: str, 50 | pytorch_dump_output: str, 51 | config: str, 52 | finetuning_task_name: str, 53 | *args 54 | ): 55 | self._logger = getLogger("transformers-cli/converting") 56 | 57 | self._logger.info("Loading model {}".format(model_type)) 58 | self._model_type = model_type 59 | self._tf_checkpoint = tf_checkpoint 60 | self._pytorch_dump_output = pytorch_dump_output 61 | self._config = config 62 | self._finetuning_task_name = finetuning_task_name 63 | 64 | def run(self): 65 | if self._model_type == "bert": 66 | try: 67 | from transformers.convert_bert_original_tf_checkpoint_to_pytorch import ( 68 | convert_tf_checkpoint_to_pytorch, 69 | ) 70 | except ImportError: 71 | msg = ( 72 | "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 73 | "In that case, it requires TensorFlow to be installed. Please see " 74 | "https://www.tensorflow.org/install/ for installation instructions." 75 | ) 76 | raise ImportError(msg) 77 | 78 | convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output) 79 | elif self._model_type == "gpt": 80 | from transformers.convert_openai_original_tf_checkpoint_to_pytorch import ( 81 | convert_openai_checkpoint_to_pytorch, 82 | ) 83 | 84 | convert_openai_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output) 85 | elif self._model_type == "transfo_xl": 86 | try: 87 | from transformers.convert_transfo_xl_original_tf_checkpoint_to_pytorch import ( 88 | convert_transfo_xl_checkpoint_to_pytorch, 89 | ) 90 | except ImportError: 91 | msg = ( 92 | "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 93 | "In that case, it requires TensorFlow to be installed. Please see " 94 | "https://www.tensorflow.org/install/ for installation instructions." 
95 | ) 96 | raise ImportError(msg) 97 | 98 | if "ckpt" in self._tf_checkpoint.lower(): 99 | TF_CHECKPOINT = self._tf_checkpoint 100 | TF_DATASET_FILE = "" 101 | else: 102 | TF_DATASET_FILE = self._tf_checkpoint 103 | TF_CHECKPOINT = "" 104 | convert_transfo_xl_checkpoint_to_pytorch( 105 | TF_CHECKPOINT, self._config, self._pytorch_dump_output, TF_DATASET_FILE 106 | ) 107 | elif self._model_type == "gpt2": 108 | try: 109 | from transformers.convert_gpt2_original_tf_checkpoint_to_pytorch import ( 110 | convert_gpt2_checkpoint_to_pytorch, 111 | ) 112 | except ImportError: 113 | msg = ( 114 | "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 115 | "In that case, it requires TensorFlow to be installed. Please see " 116 | "https://www.tensorflow.org/install/ for installation instructions." 117 | ) 118 | raise ImportError(msg) 119 | 120 | convert_gpt2_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output) 121 | elif self._model_type == "xlnet": 122 | try: 123 | from transformers.convert_xlnet_original_tf_checkpoint_to_pytorch import ( 124 | convert_xlnet_checkpoint_to_pytorch, 125 | ) 126 | except ImportError: 127 | msg = ( 128 | "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 129 | "In that case, it requires TensorFlow to be installed. Please see " 130 | "https://www.tensorflow.org/install/ for installation instructions." 131 | ) 132 | raise ImportError(msg) 133 | 134 | convert_xlnet_checkpoint_to_pytorch( 135 | self._tf_checkpoint, self._config, self._pytorch_dump_output, self._finetuning_task_name 136 | ) 137 | elif self._model_type == "xlm": 138 | from transformers.convert_xlm_original_pytorch_checkpoint_to_pytorch import ( 139 | convert_xlm_checkpoint_to_pytorch, 140 | ) 141 | 142 | convert_xlm_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output) 143 | else: 144 | raise ValueError("--model_type should be selected in the list [bert, gpt, gpt2, transfo_xl, xlnet, xlm]") 145 | -------------------------------------------------------------------------------- /src/transformers/configuration_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
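# ---------------------------------------------------------------------------
# A minimal sketch of exercising the ConvertCommand from
# src/transformers/commands/convert.py above without going through the
# transformers-cli entry point. The checkpoint/config paths are placeholders,
# and running the conversion requires TensorFlow to be installed.
from argparse import ArgumentParser

from transformers.commands.convert import ConvertCommand

parser = ArgumentParser("transformers-cli")
subcommands = parser.add_subparsers()
ConvertCommand.register_subcommand(subcommands)

args = parser.parse_args([
    "convert",
    "--model_type", "bert",
    "--tf_checkpoint", "/path/to/bert_model.ckpt",
    "--config", "/path/to/bert_config.json",
    "--pytorch_dump_output", "/path/to/pytorch_model.bin",
])
args.func(args).run()  # convert_command_factory builds the ConvertCommand
# ---------------------------------------------------------------------------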
15 | """ DistilBERT model configuration """ 16 | 17 | 18 | import logging 19 | 20 | from .configuration_utils import PretrainedConfig 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 26 | "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", 27 | "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json", 28 | "distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-config.json", 29 | "distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-distilled-squad-config.json", 30 | "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json", 31 | "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json", 32 | "distilbert-base-uncased-finetuned-sst-2-english": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json", 33 | } 34 | 35 | 36 | class DistilBertConfig(PretrainedConfig): 37 | r""" 38 | This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`. 39 | It is used to instantiate a DistilBERT model according to the specified arguments, defining the model 40 | architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of 41 | the DistilBERT `distilbert-base-uncased `__ architecture. 42 | 43 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used 44 | to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` 45 | for more information. 46 | 47 | 48 | Args: 49 | vocab_size (:obj:`int`, optional, defaults to 30522): 50 | Vocabulary size of the DistilBERT model. Defines the different tokens that 51 | can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`. 52 | max_position_embeddings (:obj:`int`, optional, defaults to 512): 53 | The maximum sequence length that this model might ever be used with. 54 | Typically set this to something large just in case (e.g., 512 or 1024 or 2048). 55 | sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`): 56 | Whether to use sinusoidal positional embeddings. 57 | n_layers (:obj:`int`, optional, defaults to 6): 58 | Number of hidden layers in the Transformer encoder. 59 | n_heads (:obj:`int`, optional, defaults to 12): 60 | Number of attention heads for each attention layer in the Transformer encoder. 61 | dim (:obj:`int`, optional, defaults to 768): 62 | Dimensionality of the encoder layers and the pooler layer. 63 | hidden_dim (:obj:`int`, optional, defaults to 3072): 64 | The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 65 | dropout (:obj:`float`, optional, defaults to 0.1): 66 | The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. 67 | attention_dropout (:obj:`float`, optional, defaults to 0.1): 68 | The dropout ratio for the attention probabilities. 69 | activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): 70 | The non-linear activation function (function or string) in the encoder and pooler. 
71 | If string, "gelu", "relu", "swish" and "gelu_new" are supported. 72 | initializer_range (:obj:`float`, optional, defaults to 0.02): 73 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 74 | qa_dropout (:obj:`float`, optional, defaults to 0.1): 75 | The dropout probabilities used in the question answering model 76 | :class:`~tranformers.DistilBertForQuestionAnswering`. 77 | seq_classif_dropout (:obj:`float`, optional, defaults to 0.2): 78 | The dropout probabilities used in the sequence classification model 79 | :class:`~tranformers.DistilBertForSequenceClassification`. 80 | 81 | Example:: 82 | 83 | from transformers import DistilBertModel, DistilBertConfig 84 | 85 | # Initializing a DistilBERT configuration 86 | configuration = DistilBertConfig() 87 | 88 | # Initializing a model from the configuration 89 | model = DistilBertModel(configuration) 90 | 91 | # Accessing the model configuration 92 | configuration = model.config 93 | 94 | Attributes: 95 | pretrained_config_archive_map (Dict[str, str]): 96 | A dictionary containing all the available pre-trained checkpoints. 97 | """ 98 | pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 99 | model_type = "distilbert" 100 | 101 | def __init__( 102 | self, 103 | vocab_size=30522, 104 | max_position_embeddings=512, 105 | sinusoidal_pos_embds=False, 106 | n_layers=6, 107 | n_heads=12, 108 | dim=768, 109 | hidden_dim=4 * 768, 110 | dropout=0.1, 111 | attention_dropout=0.1, 112 | activation="gelu", 113 | initializer_range=0.02, 114 | qa_dropout=0.1, 115 | seq_classif_dropout=0.2, 116 | **kwargs 117 | ): 118 | super().__init__(**kwargs) 119 | self.vocab_size = vocab_size 120 | self.max_position_embeddings = max_position_embeddings 121 | self.sinusoidal_pos_embds = sinusoidal_pos_embds 122 | self.n_layers = n_layers 123 | self.n_heads = n_heads 124 | self.dim = dim 125 | self.hidden_dim = hidden_dim 126 | self.dropout = dropout 127 | self.attention_dropout = attention_dropout 128 | self.activation = activation 129 | self.initializer_range = initializer_range 130 | self.qa_dropout = qa_dropout 131 | self.seq_classif_dropout = seq_classif_dropout 132 | 133 | @property 134 | def hidden_size(self): 135 | return self.dim 136 | 137 | @property 138 | def num_attention_heads(self): 139 | return self.n_heads 140 | 141 | @property 142 | def num_hidden_layers(self): 143 | return self.n_layers 144 | -------------------------------------------------------------------------------- /src/transformers/tokenization_t5.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 T5 Authors and HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
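# ---------------------------------------------------------------------------
# A small sketch for the DistilBertConfig defined in configuration_distilbert.py
# above: the class keeps DistilBERT's own field names (dim, n_heads, n_layers,
# hidden_dim) but exposes the usual BERT-style names through the read-only
# properties, so model-agnostic code needs no special-casing.
from transformers import DistilBertConfig

config = DistilBertConfig()  # defaults: dim=768, n_heads=12, n_layers=6, hidden_dim=4 * 768
assert config.hidden_size == config.dim
assert config.num_attention_heads == config.n_heads
assert config.num_hidden_layers == config.n_layers
# ---------------------------------------------------------------------------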
15 | """ Tokenization class for model T5.""" 16 | 17 | 18 | import logging 19 | import os 20 | import re 21 | from shutil import copyfile 22 | 23 | from .tokenization_utils import PreTrainedTokenizer 24 | 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | SPIECE_UNDERLINE = "▁" 29 | 30 | #################################################### 31 | # Mapping from the keyword arguments names of Tokenizer `__init__` 32 | # to file names for serializing Tokenizer instances 33 | #################################################### 34 | VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} 35 | 36 | #################################################### 37 | # Mapping from the keyword arguments names of Tokenizer `__init__` 38 | # to pretrained vocabulary URL for all the model shortcut names. 39 | #################################################### 40 | PRETRAINED_VOCAB_FILES_MAP = { 41 | "vocab_file": { 42 | "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", 43 | "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", 44 | "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", 45 | "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", 46 | "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", 47 | } 48 | } 49 | 50 | #################################################### 51 | # Mapping from model shortcut names to max length of inputs 52 | #################################################### 53 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 54 | "t5-small": 512, 55 | "t5-base": 512, 56 | "t5-large": 512, 57 | "t5-3b": 512, 58 | "t5-11b": 512, 59 | } 60 | 61 | 62 | class T5Tokenizer(PreTrainedTokenizer): 63 | """ 64 | SentencePiece based tokenizer. Peculiarities: 65 | 66 | - requires `SentencePiece `_ 67 | - `extra_ids` add a number of extra ids added to the end of the vocabulary for use as sentinels. 68 | These tokens are accessible as `` where `{%d}` is a number between 0 and extra_ids-1. 
69 | Extra tokens are indexed from the end of the vocabulary up to the beginning (`<extra_id_0>` is the last token in the vocabulary) 70 | (like in T5 preprocessing 71 | see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117) 72 | """ 73 | 74 | vocab_files_names = VOCAB_FILES_NAMES 75 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 76 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 77 | 78 | def __init__( 79 | self, 80 | vocab_file, 81 | eos_token="</s>", 82 | unk_token="<unk>", 83 | pad_token="<pad>", 84 | extra_ids=100, 85 | additional_special_tokens=None, 86 | **kwargs 87 | ): 88 | # Add extra_ids to the special token list 89 | if extra_ids > 0: 90 | if additional_special_tokens is None: 91 | additional_special_tokens = [] 92 | additional_special_tokens.extend(["<extra_id_{}>".format(i) for i in range(extra_ids)]) 93 | 94 | super().__init__( 95 | eos_token=eos_token, 96 | unk_token=unk_token, 97 | pad_token=pad_token, 98 | additional_special_tokens=additional_special_tokens, 99 | **kwargs, 100 | ) 101 | self.max_len_single_sentence = ( 102 | self.max_len 103 | ) # no default special tokens - you can update this value if you add special tokens 104 | self.max_len_sentences_pair = ( 105 | self.max_len 106 | ) # no default special tokens - you can update this value if you add special tokens 107 | 108 | try: 109 | import sentencepiece as spm 110 | except ImportError: 111 | logger.warning( 112 | "You need to install SentencePiece to use T5Tokenizer: " 113 | "https://github.com/google/sentencepiece " 114 | "(pip install sentencepiece)" 115 | ) 116 | raise 117 | 118 | self.vocab_file = vocab_file 119 | self._extra_ids = extra_ids 120 | 121 | self.sp_model = spm.SentencePieceProcessor() 122 | self.sp_model.Load(vocab_file) 123 | 124 | @property 125 | def vocab_size(self): 126 | return self.sp_model.get_piece_size() + self._extra_ids 127 | 128 | def get_vocab(self): 129 | vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} 130 | vocab.update(self.added_tokens_encoder) 131 | return vocab 132 | 133 | def __getstate__(self): 134 | state = self.__dict__.copy() 135 | state["sp_model"] = None 136 | return state 137 | 138 | def __setstate__(self, d): 139 | self.__dict__ = d 140 | try: 141 | import sentencepiece as spm 142 | except ImportError: 143 | logger.warning( 144 | "You need to install SentencePiece to use T5Tokenizer: https://github.com/google/sentencepiece " 145 | "(pip install sentencepiece)" 146 | ) 147 | raise 148 | self.sp_model = spm.SentencePieceProcessor() 149 | self.sp_model.Load(self.vocab_file) 150 | 151 | def _tokenize(self, text, sample=False): 152 | """ Take as input a string and return a list of strings (tokens) for words/sub-words 153 | """ 154 | if not sample: 155 | pieces = self.sp_model.EncodeAsPieces(text) 156 | else: 157 | pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) 158 | return pieces 159 | 160 | def _convert_token_to_id(self, token): 161 | """ Converts a token (str) into an id using the vocab.
""" 162 | if token.startswith("", token) 164 | num = int(match.group(1)) 165 | return self.vocab_size - num - 1 166 | return self.sp_model.piece_to_id(token) 167 | 168 | def _convert_id_to_token(self, index): 169 | """Converts an index (integer) in a token (str) using the vocab.""" 170 | if index < self.sp_model.get_piece_size(): 171 | token = self.sp_model.IdToPiece(index) 172 | else: 173 | token = "".format(self.vocab_size - 1 - index) 174 | return token 175 | 176 | def convert_tokens_to_string(self, tokens): 177 | """ Converts a sequence of tokens (string) in a single string. """ 178 | out_string = self.sp_model.decode_pieces(tokens) 179 | return out_string 180 | 181 | def save_vocabulary(self, save_directory): 182 | """ Save the sentencepiece vocabulary (copy original file) and special tokens file 183 | to a directory. 184 | """ 185 | if not os.path.isdir(save_directory): 186 | logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) 187 | return 188 | out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) 189 | 190 | if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): 191 | copyfile(self.vocab_file, out_vocab_file) 192 | 193 | return (out_vocab_file,) 194 | -------------------------------------------------------------------------------- /src/transformers/configuration_albert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ ALBERT model configuration """ 17 | 18 | from .configuration_utils import PretrainedConfig 19 | 20 | 21 | ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 22 | "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-config.json", 23 | "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-config.json", 24 | "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-config.json", 25 | "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-config.json", 26 | "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-config.json", 27 | "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-config.json", 28 | "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-config.json", 29 | "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-config.json", 30 | } 31 | 32 | 33 | class AlbertConfig(PretrainedConfig): 34 | r""" 35 | This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`. 36 | It is used to instantiate an ALBERT model according to the specified arguments, defining the model 37 | architecture. 
Instantiating a configuration with the defaults will yield a similar configuration to that of 38 | the ALBERT `xxlarge `__ architecture. 39 | 40 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used 41 | to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` 42 | for more information. 43 | 44 | 45 | Args: 46 | vocab_size (:obj:`int`, optional, defaults to 30000): 47 | Vocabulary size of the ALBERT model. Defines the different tokens that 48 | can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`. 49 | embedding_size (:obj:`int`, optional, defaults to 128): 50 | Dimensionality of vocabulary embeddings. 51 | hidden_size (:obj:`int`, optional, defaults to 4096): 52 | Dimensionality of the encoder layers and the pooler layer. 53 | num_hidden_layers (:obj:`int`, optional, defaults to 12): 54 | Number of hidden layers in the Transformer encoder. 55 | num_hidden_groups (:obj:`int`, optional, defaults to 1): 56 | Number of groups for the hidden layers, parameters in the same group are shared. 57 | num_attention_heads (:obj:`int`, optional, defaults to 64): 58 | Number of attention heads for each attention layer in the Transformer encoder. 59 | intermediate_size (:obj:`int`, optional, defaults to 16384): 60 | The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 61 | inner_group_num (:obj:`int`, optional, defaults to 1): 62 | The number of inner repetition of attention and ffn. 63 | hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"): 64 | The non-linear activation function (function or string) in the encoder and pooler. 65 | If string, "gelu", "relu", "swish" and "gelu_new" are supported. 66 | hidden_dropout_prob (:obj:`float`, optional, defaults to 0): 67 | The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. 68 | attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0): 69 | The dropout ratio for the attention probabilities. 70 | max_position_embeddings (:obj:`int`, optional, defaults to 512): 71 | The maximum sequence length that this model might ever be used with. Typically set this to something 72 | large (e.g., 512 or 1024 or 2048). 73 | type_vocab_size (:obj:`int`, optional, defaults to 2): 74 | The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`. 75 | initializer_range (:obj:`float`, optional, defaults to 0.02): 76 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 77 | layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): 78 | The epsilon used by the layer normalization layers. 79 | classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1): 80 | The dropout ratio for attached classifiers. 
81 | 82 | Example:: 83 | 84 | from transformers import AlbertConfig, AlbertModel 85 | # Initializing an ALBERT-xxlarge style configuration 86 | albert_xxlarge_configuration = AlbertConfig() 87 | 88 | # Initializing an ALBERT-base style configuration 89 | albert_base_configuration = AlbertConfig( 90 | hidden_size=768, 91 | num_attention_heads=12, 92 | intermediate_size=3072, 93 | ) 94 | 95 | # Initializing a model from the ALBERT-base style configuration 96 | model = AlbertModel(albert_xxlarge_configuration) 97 | 98 | # Accessing the model configuration 99 | configuration = model.config 100 | 101 | Attributes: 102 | pretrained_config_archive_map (Dict[str, str]): 103 | A dictionary containing all the available pre-trained checkpoints. 104 | """ 105 | 106 | pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 107 | model_type = "albert" 108 | 109 | def __init__( 110 | self, 111 | vocab_size=30000, 112 | embedding_size=128, 113 | hidden_size=4096, 114 | num_hidden_layers=12, 115 | num_hidden_groups=1, 116 | num_attention_heads=64, 117 | intermediate_size=16384, 118 | inner_group_num=1, 119 | hidden_act="gelu_new", 120 | hidden_dropout_prob=0, 121 | attention_probs_dropout_prob=0, 122 | max_position_embeddings=512, 123 | type_vocab_size=2, 124 | initializer_range=0.02, 125 | layer_norm_eps=1e-12, 126 | classifier_dropout_prob=0.1, 127 | **kwargs 128 | ): 129 | super().__init__(**kwargs) 130 | 131 | self.vocab_size = vocab_size 132 | self.embedding_size = embedding_size 133 | self.hidden_size = hidden_size 134 | self.num_hidden_layers = num_hidden_layers 135 | self.num_hidden_groups = num_hidden_groups 136 | self.num_attention_heads = num_attention_heads 137 | self.inner_group_num = inner_group_num 138 | self.hidden_act = hidden_act 139 | self.intermediate_size = intermediate_size 140 | self.hidden_dropout_prob = hidden_dropout_prob 141 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 142 | self.max_position_embeddings = max_position_embeddings 143 | self.type_vocab_size = type_vocab_size 144 | self.initializer_range = initializer_range 145 | self.layer_norm_eps = layer_norm_eps 146 | self.classifier_dropout_prob = classifier_dropout_prob 147 | -------------------------------------------------------------------------------- /src/transformers/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """PyTorch optimization for BERT model.""" 16 | 17 | import logging 18 | import math 19 | 20 | import torch 21 | from torch.optim import Optimizer 22 | from torch.optim.lr_scheduler import LambdaLR 23 | 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | def get_constant_schedule(optimizer, last_epoch=-1): 29 | """ Create a schedule with a constant learning rate. 
30 | """ 31 | return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch) 32 | 33 | 34 | def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1): 35 | """ Create a schedule with a constant learning rate preceded by a warmup 36 | period during which the learning rate increases linearly between 0 and 1. 37 | """ 38 | 39 | def lr_lambda(current_step): 40 | if current_step < num_warmup_steps: 41 | return float(current_step) / float(max(1.0, num_warmup_steps)) 42 | return 1.0 43 | 44 | return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch) 45 | 46 | 47 | def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): 48 | """ Create a schedule with a learning rate that decreases linearly after 49 | linearly increasing during a warmup period. 50 | """ 51 | 52 | def lr_lambda(current_step): 53 | if current_step < num_warmup_steps: 54 | return float(current_step) / float(max(1, num_warmup_steps)) 55 | return max( 56 | 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)) 57 | ) 58 | 59 | return LambdaLR(optimizer, lr_lambda, last_epoch) 60 | 61 | 62 | def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1): 63 | """ Create a schedule with a learning rate that decreases following the 64 | values of the cosine function between 0 and `pi * cycles` after a warmup 65 | period during which it increases linearly between 0 and 1. 66 | """ 67 | 68 | def lr_lambda(current_step): 69 | if current_step < num_warmup_steps: 70 | return float(current_step) / float(max(1, num_warmup_steps)) 71 | progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) 72 | return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) 73 | 74 | return LambdaLR(optimizer, lr_lambda, last_epoch) 75 | 76 | 77 | def get_cosine_with_hard_restarts_schedule_with_warmup( 78 | optimizer, num_warmup_steps, num_training_steps, num_cycles=1.0, last_epoch=-1 79 | ): 80 | """ Create a schedule with a learning rate that decreases following the 81 | values of the cosine function with several hard restarts, after a warmup 82 | period during which it increases linearly between 0 and 1. 83 | """ 84 | 85 | def lr_lambda(current_step): 86 | if current_step < num_warmup_steps: 87 | return float(current_step) / float(max(1, num_warmup_steps)) 88 | progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) 89 | if progress >= 1.0: 90 | return 0.0 91 | return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0)))) 92 | 93 | return LambdaLR(optimizer, lr_lambda, last_epoch) 94 | 95 | 96 | class AdamW(Optimizer): 97 | """ Implements Adam algorithm with weight decay fix. 98 | 99 | Parameters: 100 | lr (float): learning rate. Default 1e-3. 101 | betas (tuple of 2 floats): Adams beta parameters (b1, b2). Default: (0.9, 0.999) 102 | eps (float): Adams epsilon. Default: 1e-6 103 | weight_decay (float): Weight decay. Default: 0.0 104 | correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True. 
105 | """ 106 | 107 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True): 108 | if lr < 0.0: 109 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 110 | if not 0.0 <= betas[0] < 1.0: 111 | raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0])) 112 | if not 0.0 <= betas[1] < 1.0: 113 | raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1])) 114 | if not 0.0 <= eps: 115 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps)) 116 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias) 117 | super().__init__(params, defaults) 118 | 119 | def step(self, closure=None): 120 | """Performs a single optimization step. 121 | 122 | Arguments: 123 | closure (callable, optional): A closure that reevaluates the model 124 | and returns the loss. 125 | """ 126 | loss = None 127 | if closure is not None: 128 | loss = closure() 129 | 130 | for group in self.param_groups: 131 | for p in group["params"]: 132 | if p.grad is None: 133 | continue 134 | grad = p.grad.data 135 | if grad.is_sparse: 136 | raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead") 137 | 138 | state = self.state[p] 139 | 140 | # State initialization 141 | if len(state) == 0: 142 | state["step"] = 0 143 | # Exponential moving average of gradient values 144 | state["exp_avg"] = torch.zeros_like(p.data) 145 | # Exponential moving average of squared gradient values 146 | state["exp_avg_sq"] = torch.zeros_like(p.data) 147 | 148 | exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] 149 | beta1, beta2 = group["betas"] 150 | 151 | state["step"] += 1 152 | 153 | # Decay the first and second moment running average coefficient 154 | # In-place operations to update the averages at the same time 155 | exp_avg.mul_(beta1).add_(1.0 - beta1, grad) 156 | exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad) 157 | denom = exp_avg_sq.sqrt().add_(group["eps"]) 158 | 159 | step_size = group["lr"] 160 | if group["correct_bias"]: # No bias correction for Bert 161 | bias_correction1 = 1.0 - beta1 ** state["step"] 162 | bias_correction2 = 1.0 - beta2 ** state["step"] 163 | step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 164 | 165 | p.data.addcdiv_(-step_size, exp_avg, denom) 166 | 167 | # Just adding the square of the weights to the loss function is *not* 168 | # the correct way of using L2 regularization/weight decay with Adam, 169 | # since that will interact with the m and v parameters in strange ways. 170 | # 171 | # Instead we want to decay the weights in a manner that doesn't interact 172 | # with the m/v parameters. This is equivalent to adding the square 173 | # of the weights to the loss with plain (non-momentum) SGD. 
174 | # Add weight decay at the end (fixed version) 175 | if group["weight_decay"] > 0.0: 176 | p.data.add_(-group["lr"] * group["weight_decay"], p.data) 177 | 178 | return loss 179 | -------------------------------------------------------------------------------- /src/transformers/commands/serving.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from argparse import ArgumentParser, Namespace 3 | from typing import Any, List, Optional 4 | 5 | from transformers import Pipeline 6 | from transformers.commands import BaseTransformersCLICommand 7 | from transformers.pipelines import SUPPORTED_TASKS, pipeline 8 | 9 | 10 | try: 11 | from uvicorn import run 12 | from fastapi import FastAPI, HTTPException, Body 13 | from fastapi.routing import APIRoute 14 | from pydantic import BaseModel 15 | from starlette.responses import JSONResponse 16 | 17 | _serve_dependencies_installed = True 18 | except (ImportError, AttributeError): 19 | BaseModel = object 20 | 21 | def Body(*x, **y): 22 | pass 23 | 24 | _serve_dependencies_installed = False 25 | 26 | 27 | logger = logging.getLogger("transformers-cli/serving") 28 | 29 | 30 | def serve_command_factory(args: Namespace): 31 | """ 32 | Factory function used to instantiate serving server from provided command line arguments. 33 | :return: ServeCommand 34 | """ 35 | nlp = pipeline( 36 | task=args.task, 37 | model=args.model if args.model else None, 38 | config=args.config, 39 | tokenizer=args.tokenizer, 40 | device=args.device, 41 | ) 42 | return ServeCommand(nlp, args.host, args.port, args.workers) 43 | 44 | 45 | class ServeModelInfoResult(BaseModel): 46 | """ 47 | Expose model information 48 | """ 49 | 50 | infos: dict 51 | 52 | 53 | class ServeTokenizeResult(BaseModel): 54 | """ 55 | Tokenize result model 56 | """ 57 | 58 | tokens: List[str] 59 | tokens_ids: Optional[List[int]] 60 | 61 | 62 | class ServeDeTokenizeResult(BaseModel): 63 | """ 64 | DeTokenize result model 65 | """ 66 | 67 | text: str 68 | 69 | 70 | class ServeForwardResult(BaseModel): 71 | """ 72 | Forward result model 73 | """ 74 | 75 | output: Any 76 | 77 | 78 | class ServeCommand(BaseTransformersCLICommand): 79 | @staticmethod 80 | def register_subcommand(parser: ArgumentParser): 81 | """ 82 | Register this command to argparse so it's available for the transformer-cli 83 | :param parser: Root parser to register command-specific arguments 84 | :return: 85 | """ 86 | serve_parser = parser.add_parser( 87 | "serve", help="CLI tool to run inference requests through REST and GraphQL endpoints." 
88 | ) 89 | serve_parser.add_argument( 90 | "--task", type=str, choices=SUPPORTED_TASKS.keys(), help="The task to run the pipeline on" 91 | ) 92 | serve_parser.add_argument("--host", type=str, default="localhost", help="Interface the server will listen on.") 93 | serve_parser.add_argument("--port", type=int, default=8888, help="Port the serving will listen to.") 94 | serve_parser.add_argument("--workers", type=int, default=1, help="Number of HTTP workers") 95 | serve_parser.add_argument("--model", type=str, help="Model's name or path to stored model.") 96 | serve_parser.add_argument("--config", type=str, help="Model's config name or path to stored model.") 97 | serve_parser.add_argument("--tokenizer", type=str, help="Tokenizer name to use.") 98 | serve_parser.add_argument( 99 | "--device", 100 | type=int, 101 | default=-1, 102 | help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)", 103 | ) 104 | serve_parser.set_defaults(func=serve_command_factory) 105 | 106 | def __init__(self, pipeline: Pipeline, host: str, port: int, workers: int): 107 | 108 | self._pipeline = pipeline 109 | 110 | self.host = host 111 | self.port = port 112 | self.workers = workers 113 | 114 | if not _serve_dependencies_installed: 115 | raise RuntimeError( 116 | "Using serve command requires FastAPI and uvicorn. " 117 | 'Please install transformers with [serving]: pip install "transformers[serving]". ' 118 | "Or install FastAPI and uvicorn separately." 119 | ) 120 | else: 121 | logger.info("Serving model over {}:{}".format(host, port)) 122 | self._app = FastAPI( 123 | routes=[ 124 | APIRoute( 125 | "/", 126 | self.model_info, 127 | response_model=ServeModelInfoResult, 128 | response_class=JSONResponse, 129 | methods=["GET"], 130 | ), 131 | APIRoute( 132 | "/tokenize", 133 | self.tokenize, 134 | response_model=ServeTokenizeResult, 135 | response_class=JSONResponse, 136 | methods=["POST"], 137 | ), 138 | APIRoute( 139 | "/detokenize", 140 | self.detokenize, 141 | response_model=ServeDeTokenizeResult, 142 | response_class=JSONResponse, 143 | methods=["POST"], 144 | ), 145 | APIRoute( 146 | "/forward", 147 | self.forward, 148 | response_model=ServeForwardResult, 149 | response_class=JSONResponse, 150 | methods=["POST"], 151 | ), 152 | ], 153 | timeout=600, 154 | ) 155 | 156 | def run(self): 157 | run(self._app, host=self.host, port=self.port, workers=self.workers) 158 | 159 | def model_info(self): 160 | return ServeModelInfoResult(infos=vars(self._pipeline.model.config)) 161 | 162 | def tokenize(self, text_input: str = Body(None, embed=True), return_ids: bool = Body(False, embed=True)): 163 | """ 164 | Tokenize the provided input and optionally return the corresponding token ids: 165 | - **text_input**: String to tokenize 166 | - **return_ids**: Boolean flag indicating whether the tokens should be converted to their integer ids.
167 | """ 168 | try: 169 | tokens_txt = self._pipeline.tokenizer.tokenize(text_input) 170 | 171 | if return_ids: 172 | tokens_ids = self._pipeline.tokenizer.convert_tokens_to_ids(tokens_txt) 173 | return ServeTokenizeResult(tokens=tokens_txt, tokens_ids=tokens_ids) 174 | else: 175 | return ServeTokenizeResult(tokens=tokens_txt) 176 | 177 | except Exception as e: 178 | raise HTTPException(status_code=500, detail={"model": "", "error": str(e)}) 179 | 180 | def detokenize( 181 | self, 182 | tokens_ids: List[int] = Body(None, embed=True), 183 | skip_special_tokens: bool = Body(False, embed=True), 184 | cleanup_tokenization_spaces: bool = Body(True, embed=True), 185 | ): 186 | """ 187 | Detokenize the provided tokens ids to readable text: 188 | - **tokens_ids**: List of tokens ids 189 | - **skip_special_tokens**: Flag indicating to not try to decode special tokens 190 | - **cleanup_tokenization_spaces**: Flag indicating to remove all leading/trailing spaces and intermediate ones. 191 | """ 192 | try: 193 | decoded_str = self._pipeline.tokenizer.decode(tokens_ids, skip_special_tokens, cleanup_tokenization_spaces) 194 | return ServeDeTokenizeResult(model="", text=decoded_str) 195 | except Exception as e: 196 | raise HTTPException(status_code=500, detail={"model": "", "error": str(e)}) 197 | 198 | async def forward(self, inputs=Body(None, embed=True)): 199 | """ 200 | **inputs**: 201 | **attention_mask**: 202 | **tokens_type_ids**: 203 | """ 204 | 205 | # Check we don't have empty string 206 | if len(inputs) == 0: 207 | return ServeForwardResult(output=[], attention=[]) 208 | 209 | try: 210 | # Forward through the model 211 | output = self._pipeline(inputs) 212 | return ServeForwardResult(output=output) 213 | except Exception as e: 214 | raise HTTPException(500, {"error": str(e)}) 215 | -------------------------------------------------------------------------------- /src/transformers/modeling_tf_transfo_xl_utilities.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ A TF 2.0 Adaptive Softmax for Transformer XL model. 
17 | """ 18 | 19 | 20 | import tensorflow as tf 21 | 22 | from .modeling_tf_utils import shape_list 23 | 24 | 25 | class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): 26 | def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs): 27 | super().__init__(**kwargs) 28 | 29 | self.vocab_size = vocab_size 30 | self.d_embed = d_embed 31 | self.d_proj = d_proj 32 | 33 | self.cutoffs = cutoffs + [vocab_size] 34 | self.cutoff_ends = [0] + self.cutoffs 35 | self.div_val = div_val 36 | 37 | self.shortlist_size = self.cutoffs[0] 38 | self.n_clusters = len(self.cutoffs) - 1 39 | self.head_size = self.shortlist_size + self.n_clusters 40 | self.keep_order = keep_order 41 | 42 | self.out_layers = [] 43 | self.out_projs = [] 44 | 45 | def build(self, input_shape): 46 | if self.n_clusters > 0: 47 | self.cluster_weight = self.add_weight( 48 | shape=(self.n_clusters, self.d_embed), initializer="zeros", trainable=True, name="cluster_weight" 49 | ) 50 | self.cluster_bias = self.add_weight( 51 | shape=(self.n_clusters,), initializer="zeros", trainable=True, name="cluster_bias" 52 | ) 53 | 54 | if self.div_val == 1: 55 | for i in range(len(self.cutoffs)): 56 | if self.d_proj != self.d_embed: 57 | weight = self.add_weight( 58 | shape=(self.d_embed, self.d_proj), 59 | initializer="zeros", 60 | trainable=True, 61 | name="out_projs_._{}".format(i), 62 | ) 63 | self.out_projs.append(weight) 64 | else: 65 | self.out_projs.append(None) 66 | weight = self.add_weight( 67 | shape=(self.vocab_size, self.d_embed,), 68 | initializer="zeros", 69 | trainable=True, 70 | name="out_layers_._{}_._weight".format(i), 71 | ) 72 | bias = self.add_weight( 73 | shape=(self.vocab_size,), 74 | initializer="zeros", 75 | trainable=True, 76 | name="out_layers_._{}_._bias".format(i), 77 | ) 78 | self.out_layers.append((weight, bias)) 79 | else: 80 | for i in range(len(self.cutoffs)): 81 | l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] 82 | d_emb_i = self.d_embed // (self.div_val ** i) 83 | 84 | weight = self.add_weight( 85 | shape=(d_emb_i, self.d_proj), initializer="zeros", trainable=True, name="out_projs_._{}".format(i) 86 | ) 87 | self.out_projs.append(weight) 88 | weight = self.add_weight( 89 | shape=(r_idx - l_idx, d_emb_i,), 90 | initializer="zeros", 91 | trainable=True, 92 | name="out_layers_._{}_._weight".format(i), 93 | ) 94 | bias = self.add_weight( 95 | shape=(r_idx - l_idx,), 96 | initializer="zeros", 97 | trainable=True, 98 | name="out_layers_._{}_._bias".format(i), 99 | ) 100 | self.out_layers.append((weight, bias)) 101 | super().build(input_shape) 102 | 103 | @staticmethod 104 | def _logit(x, W, b, proj=None): 105 | y = x 106 | if proj is not None: 107 | y = tf.einsum("ibd,ed->ibe", y, proj) 108 | return tf.einsum("ibd,nd->ibn", y, W) + b 109 | 110 | @staticmethod 111 | def _gather_logprob(logprob, target): 112 | lp_size = shape_list(logprob) 113 | r = tf.range(lp_size[0]) 114 | idx = tf.stack([r, target], 1) 115 | return tf.gather_nd(logprob, idx) 116 | 117 | def call(self, inputs, return_mean=True, training=False): 118 | hidden, target = inputs 119 | head_logprob = 0 120 | if self.n_clusters == 0: 121 | output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0]) 122 | if target is not None: 123 | loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output) 124 | out = tf.nn.log_softmax(output, axis=-1) 125 | else: 126 | hidden_sizes = shape_list(hidden) 127 | out = [] 128 | loss = tf.zeros(hidden_sizes[:2], 
dtype=tf.float32) 129 | for i in range(len(self.cutoffs)): 130 | l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] 131 | if target is not None: 132 | mask = (target >= l_idx) & (target < r_idx) 133 | mask_idx = tf.where(mask) 134 | cur_target = tf.boolean_mask(target, mask) - l_idx 135 | 136 | if self.div_val == 1: 137 | cur_W = self.out_layers[0][0][l_idx:r_idx] 138 | cur_b = self.out_layers[0][1][l_idx:r_idx] 139 | else: 140 | cur_W = self.out_layers[i][0] 141 | cur_b = self.out_layers[i][1] 142 | 143 | if i == 0: 144 | cur_W = tf.concat([cur_W, self.cluster_weight], 0) 145 | cur_b = tf.concat([cur_b, self.cluster_bias], 0) 146 | 147 | head_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[0]) 148 | head_logprob = tf.nn.log_softmax(head_logit) 149 | out.append(head_logprob[..., : self.cutoffs[0]]) 150 | if target is not None: 151 | cur_head_logprob = tf.boolean_mask(head_logprob, mask) 152 | cur_logprob = self._gather_logprob(cur_head_logprob, cur_target) 153 | else: 154 | tail_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[i]) 155 | tail_logprob = tf.nn.log_softmax(tail_logit) 156 | cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster 157 | logprob_i = head_logprob[..., cluster_prob_idx, None] + tail_logprob 158 | out.append(logprob_i) 159 | if target is not None: 160 | cur_head_logprob = tf.boolean_mask(head_logprob, mask) 161 | cur_tail_logprob = tf.boolean_mask(tail_logprob, mask) 162 | cur_logprob = self._gather_logprob(cur_tail_logprob, cur_target) 163 | cur_logprob += cur_head_logprob[:, self.cutoff_ends[1] + i - 1] 164 | if target is not None: 165 | loss += tf.scatter_nd(mask_idx, -cur_logprob, tf.cast(shape_list(loss), dtype=tf.int64)) 166 | out = tf.concat(out, axis=-1) 167 | 168 | if target is not None: 169 | if return_mean: 170 | loss = tf.reduce_mean(loss) 171 | # Add the training-time loss value to the layer using `self.add_loss()`. 172 | self.add_loss(loss) 173 | 174 | # Log the loss as a metric (we could log arbitrary metrics, 175 | # including different metrics for training and inference. 176 | self.add_metric(loss, name=self.name, aggregation="mean" if return_mean else "") 177 | 178 | return out 179 | -------------------------------------------------------------------------------- /train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source init.sh 4 | 5 | MODEL_NAME_OR_PATH='xlm-roberta-large' 6 | 7 | usage() 8 | { 9 | cat << EOF 10 | usage: $0 options 11 | OPTIONS: 12 | -h Show the help and exit 13 | -n Experiment name for saving to output directory 14 | -m Pretrained model name or path 15 | -g gpus to use, default is to use all GPUs 16 | -t task to train 17 | -x For convinent usage 18 | EOF 19 | } 20 | 21 | while getopts "h:m:n:g:t:x:" opt 22 | do 23 | case $opt in 24 | h) 25 | usage 26 | exit 1 27 | ;; 28 | n) 29 | EXP_NAME=$OPTARG 30 | ;; 31 | m) 32 | MODEL_NAME_OR_PATH=${OPTARG} 33 | ;; 34 | g) 35 | N_GPU=$OPTARG 36 | ;; 37 | t) 38 | TASK=$OPTARG 39 | ;; 40 | x) 41 | OTHER_ARGS=$OPTARG 42 | ;; 43 | esac 44 | done 45 | 46 | DATA_DIR=$DATA_ROOT/data_raw 47 | if [[ ! 
-d $DATA_DIR ]]; then 48 | echo "$DATA_DIR not exist" 49 | exit 1 50 | fi 51 | 52 | 53 | OUTPUT_DIR=$DATA_ROOT/outputs/${EXP_NAME:-debug} 54 | mkdir -p $OUTPUT_DIR 55 | 56 | xnli() { 57 | python -m torch.distributed.launch --nproc_per_node=$N_GPU --master_port=$RANDOM ./examples/run_xcls.py \ 58 | --task_name xnli \ 59 | --data_dir $DATA_DIR/xnli \ 60 | --model_type filter \ 61 | --model_name_or_path $MODEL_NAME_OR_PATH \ 62 | --language ar,bg,de,el,en,es,fr,hi,ru,sw,th,tr,ur,vi,zh \ 63 | --train_language en \ 64 | --do_train \ 65 | --eval_splits valid \ 66 | --fp16 \ 67 | --per_gpu_train_batch_size 8 \ 68 | --learning_rate 3e-6 \ 69 | --num_train_epochs 5 \ 70 | --max_seq_length 256 \ 71 | --output_dir $OUTPUT_DIR \ 72 | --log_dir $OUTPUT_DIR \ 73 | --overwrite_output_dir \ 74 | --logging_steps 500 \ 75 | --logging_each_epoch \ 76 | --per_gpu_eval_batch_size 64 \ 77 | --eval_all_checkpoints \ 78 | --filter_m 1 --filter_k 1 \ 79 | ${OTHER_ARGS} 80 | } 81 | 82 | pawsx() { 83 | python -m torch.distributed.launch --nproc_per_node=$N_GPU --master_port=$RANDOM ./examples/run_xcls.py \ 84 | --task_name pawsx \ 85 | --data_dir $DATA_DIR/pawsx \ 86 | --model_type filter \ 87 | --language de,en,es,fr,ja,ko,zh \ 88 | --model_name_or_path $MODEL_NAME_OR_PATH \ 89 | --train_language en \ 90 | --do_train \ 91 | --eval_splits valid \ 92 | --per_gpu_train_batch_size 4 \ 93 | --learning_rate 1e-5 \ 94 | --num_train_epochs 4 \ 95 | --max_seq_length 256 \ 96 | --output_dir $OUTPUT_DIR \ 97 | --log_dir $OUTPUT_DIR \ 98 | --overwrite_output_dir \ 99 | --logging_steps 500 \ 100 | --per_gpu_eval_batch_size 64 \ 101 | --logging_each_epoch \ 102 | --filter_m 1 --filter_k 1 \ 103 | ${OTHER_ARGS} 104 | } 105 | 106 | mlqa() { 107 | # mlqa and xquad share the same training set 108 | python -m torch.distributed.launch --nproc_per_node=$N_GPU --master_port=$RANDOM ./examples/run_xqa.py \ 109 | --task_name mlqa \ 110 | --data_dir $DATA_DIR \ 111 | --model_type filter \ 112 | --model_name_or_path $MODEL_NAME_OR_PATH \ 113 | --language ar,de,en,es,hi,vi,zh \ 114 | --train_language en \ 115 | --do_train \ 116 | --eval_splits 'dev' \ 117 | --do_lower_case \ 118 | --per_gpu_train_batch_size 4 \ 119 | --gradient_accumulation_steps 2 \ 120 | --learning_rate 5e-6 \ 121 | --per_gpu_eval_batch_size 64 \ 122 | --num_train_epochs 2.0 \ 123 | --max_seq_length 384 \ 124 | --doc_stride 128 \ 125 | --output_dir $OUTPUT_DIR \ 126 | --log_dir $OUTPUT_DIR \ 127 | --logging_each_epoch \ 128 | --evaluate_during_training \ 129 | --threads 8 \ 130 | --filter_m 1 --filter_k 20 \ 131 | ${OTHER_ARGS} 132 | } 133 | 134 | 135 | xquad() { 136 | python -m torch.distributed.launch --nproc_per_node=$N_GPU --master_port=$RANDOM ./examples/run_xqa.py \ 137 | --task_name xquad \ 138 | --data_dir $DATA_DIR/ \ 139 | --model_type filter \ 140 | --model_name_or_path $MODEL_NAME_OR_PATH \ 141 | --language ar,de,el,en,es,hi,ru,th,tr,vi,zh \ 142 | --train_language en \ 143 | --do_train \ 144 | --eval_splits 'dev' \ 145 | --do_lower_case \ 146 | --per_gpu_train_batch_size 4 \ 147 | --learning_rate 5e-6 \ 148 | --per_gpu_eval_batch_size 64 \ 149 | --num_train_epochs 2.0 \ 150 | --max_seq_length 384 \ 151 | --doc_stride 128 \ 152 | --output_dir $OUTPUT_DIR \ 153 | --log_dir $OUTPUT_DIR \ 154 | --logging_each_epoch \ 155 | --eval_all_checkpoints \ 156 | --threads 8 \ 157 | --filter_m 1 --filter_k 20 \ 158 | ${OTHER_ARGS} 159 | } 160 | 161 | tydiqa() { 162 | python -m torch.distributed.launch --nproc_per_node=$N_GPU --master_port=$RANDOM ./examples/run_xqa.py \ 163 
| --task_name tydiqa \ 164 | --data_dir $DATA_DIR \ 165 | --model_type filter \ 166 | --model_name_or_path $MODEL_NAME_OR_PATH \ 167 | --language ar,bn,en,fi,id,ko,ru,sw,te \ 168 | --train_language en \ 169 | --do_train \ 170 | --do_lower_case \ 171 | --eval_splits dev \ 172 | --per_gpu_train_batch_size 4 \ 173 | --learning_rate 1e-5 \ 174 | --per_gpu_eval_batch_size 64 \ 175 | --num_train_epochs 4.0 \ 176 | --logging_each_epoch \ 177 | --max_seq_length 384 \ 178 | --doc_stride 128 \ 179 | --output_dir $OUTPUT_DIR \ 180 | --log_dir $OUTPUT_DIR \ 181 | --overwrite_output_dir \ 182 | --eval_all_checkpoints \ 183 | --threads 8 \ 184 | --filter_m 1 --filter_k 20 \ 185 | ${OTHER_ARGS} 186 | } 187 | 188 | udpos() { 189 | python -m torch.distributed.launch --nproc_per_node=$N_GPU --master_port=$RANDOM ./examples/run_xtreme_tag.py \ 190 | --task_name udpos \ 191 | --data_dir $DATA_ROOT/udpos/udpos_processed_maxlen128 \ 192 | --model_type filter \ 193 | --model_name_or_path $MODEL_NAME_OR_PATH \ 194 | --labels $DATA_ROOT/udpos/udpos_processed_maxlen128/labels.txt \ 195 | --language af,ar,bg,de,el,en,es,et,eu,fa,fi,fr,he,hi,hu,id,it,ja,kk,ko,mr,nl,pt,ru,ta,te,th,tl,tr,ur,vi,yo,zh \ 196 | --train_language en \ 197 | --do_train \ 198 | --eval_splits dev \ 199 | --max_seq_length 128 \ 200 | --num_train_epochs 20 \ 201 | --per_gpu_train_batch_size 8 \ 202 | --per_gpu_eval_batch_size 64 \ 203 | --learning_rate 5e-6 \ 204 | --save_steps 1000 \ 205 | --output_dir $OUTPUT_DIR \ 206 | --log_dir $OUTPUT_DIR \ 207 | --eval_all_checkpoints \ 208 | --filter_m 1 --filter_k 1 \ 209 | ${OTHER_ARGS} 210 | } 211 | 212 | panx() { 213 | python -m torch.distributed.launch --nproc_per_node=${N_GPU:-8} --master_port=$RANDOM ./examples/run_tag.py \ 214 | --task_name panx \ 215 | --data_dir $DATA_ROOT/panx/panx_processed_maxlen128 \ 216 | --labels $DATA_ROOT/panx/panx_processed_maxlen128/labels.txt \ 217 | --model_type filter \ 218 | --model_name_or_path $MODEL_NAME_OR_PATH \ 219 | --language ar,he,vi,id,jv,ms,tl,eu,ml,ta,te,af,nl,en,de,el,bn,hi,mr,ur,fa,fr,it,pt,es,bg,ru,ja,ka,ko,th,sw,yo,my,zh,kk,tr,et,fi,hu \ 220 | --train_language en \ 221 | --do_train \ 222 | --eval_splits dev \ 223 | --max_seq_length 128 \ 224 | --num_train_epochs 20 \ 225 | --per_gpu_train_batch_size 8 \ 226 | --per_gpu_eval_batch_size 64 \ 227 | --learning_rate 5e-6 \ 228 | --save_steps 1000 \ 229 | --eval_all_checkpoints \ 230 | --log_dir $OUTPUT_DIR \ 231 | --output_dir $OUTPUT_DIR \ 232 | --filter_m 1 --filter_k 1 \ 233 | ${OTHER_ARGS} 234 | } 235 | 236 | for task in xnli pawsx mlqa xquad tydiqa udpos panx 237 | do 238 | if [[ ${TASK:-"xnli"} == $task ]]; then 239 | $task 240 | fi 241 | done 242 | --------------------------------------------------------------------------------
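A minimal usage sketch for the optimization utilities defined in src/transformers/optimization.py above; the toy model, random data, and hyperparameter values are illustrative assumptions rather than settings taken from this repository:

    import torch
    from transformers.optimization import AdamW, get_linear_schedule_with_warmup

    model = torch.nn.Linear(10, 2)  # toy stand-in for a real transformer
    optimizer = AdamW(model.parameters(), lr=3e-6, weight_decay=0.01, correct_bias=True)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=1000)

    for step in range(1000):
        inputs, labels = torch.randn(8, 10), torch.randint(0, 2, (8,))
        loss = torch.nn.functional.cross_entropy(model(inputs), labels)
        loss.backward()
        optimizer.step()       # AdamW update with decoupled (fixed) weight decay
        scheduler.step()       # linear warmup for 100 steps, then linear decay to zero
        optimizer.zero_grad()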