├── third_party └── ud-conversion-tools │ ├── lib │ └── __init__.py │ └── conllu_to_conll.py ├── init.sh ├── src └── transformers │ ├── commands │ ├── __init__.py │ ├── download.py │ ├── env.py │ ├── run.py │ ├── train.py │ ├── convert.py │ └── serving.py │ ├── data │ ├── processors │ │ ├── __init__.py │ │ └── xnli.py │ ├── __init__.py │ └── metrics │ │ ├── __init__.py │ │ └── mlqa_evaluation_v1.py │ ├── tokenization_bart.py │ ├── activations.py │ ├── configuration_mmbt.py │ ├── configuration_camembert.py │ ├── utils_encoder_decoder.py │ ├── configuration_xlm_roberta.py │ ├── convert_t5_original_tf_checkpoint_to_pytorch.py │ ├── convert_bert_original_tf_checkpoint_to_pytorch.py │ ├── convert_albert_original_tf_checkpoint_to_pytorch.py │ ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py │ ├── convert_openai_original_tf_checkpoint_to_pytorch.py │ ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py │ ├── configuration_roberta.py │ ├── tokenization_distilbert.py │ ├── configuration_bart.py │ ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py │ ├── configuration_t5.py │ ├── convert_bert_pytorch_checkpoint_to_original_tf.py │ ├── convert_bart_original_pytorch_checkpoint_to_pytorch.py │ ├── modeling_tf_camembert.py │ ├── modeling_tf_xlm_roberta.py │ ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py │ ├── tokenization_flaubert.py │ ├── modeling_camembert.py │ ├── configuration_ctrl.py │ ├── hf_api.py │ ├── configuration_distilbert.py │ ├── tokenization_t5.py │ ├── configuration_albert.py │ ├── optimization.py │ └── modeling_tf_transfo_xl_utilities.py ├── scripts ├── download_model.sh ├── preprocess_panx.sh └── preprocess_udpos.sh ├── LICENSE ├── transformers-cli ├── dockers └── Dockerfile ├── README.md ├── setup.py ├── eval.sh └── train.sh /third_party/ud-conversion-tools/lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export DATA_ROOT=/ssd/data 4 | 5 | export N_GPU=`nvidia-smi -L | wc -l` 6 | export PYTORCH_PRETRAINED_BERT_CACHE=$DATA_ROOT/pretrained-cache 7 | export OMP_NUM_THREADS=4 8 | export MKL_NUM_THREADS=4 9 | 10 | pip install --user --editable ./ 11 | mkdir -p $DATA_ROOT/outputs 12 | -------------------------------------------------------------------------------- /src/transformers/commands/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from argparse import ArgumentParser 3 | 4 | 5 | class BaseTransformersCLICommand(ABC): 6 | @staticmethod 7 | @abstractmethod 8 | def register_subcommand(parser: ArgumentParser): 9 | raise NotImplementedError() 10 | 11 | @abstractmethod 12 | def run(self): 13 | raise NotImplementedError() 14 | -------------------------------------------------------------------------------- /scripts/download_model.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | DOWNLOAD=$1 5 | 6 | BLOB='https://convaisharables.blob.core.windows.net/filter/data/outputs/phase2' 7 | 8 | download_model() { 9 | mkdir -p $DOWNLOAD/outputs/phase2/$task 10 | for file in config.json pytorch_model.bin sentencepiece.bpe.model special_tokens_map.json tokenizer_config.json; do 11 | wget $BLOB/$task/$file -O $DOWNLOAD/outputs/phase2/$task/$file 12 | done 13 | } 14 | -------------------------------------------------------------------------------- /src/transformers/data/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | from .xglue import xglue_convert_examples_to_features, xglue_output_modes, xglue_processors, xglue_tasks_num_labels 6 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels 7 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features 8 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor 9 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels 10 | -------------------------------------------------------------------------------- /src/transformers/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | from .metrics import is_sklearn_available 6 | from .processors import ( 7 | DataProcessor, 8 | InputExample, 9 | InputFeatures, 10 | SingleSentenceClassificationProcessor, 11 | SquadExample, 12 | SquadFeatures, 13 | SquadV1Processor, 14 | SquadV2Processor, 15 | glue_convert_examples_to_features, 16 | glue_output_modes, 17 | glue_processors, 18 | glue_tasks_num_labels, 19 | 20 | xglue_convert_examples_to_features, 21 | xglue_output_modes, 22 | xglue_processors, 23 | xglue_tasks_num_labels, 24 | 25 | squad_convert_examples_to_features, 26 | xnli_output_modes, 27 | xnli_processors, 28 | xnli_tasks_num_labels, 29 | ) 30 | 31 | 32 | if is_sklearn_available(): 33 | from .metrics import glue_compute_metrics, xnli_compute_metrics, xglue_compute_metrics 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 yuwfan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /transformers-cli: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from argparse import ArgumentParser 3 | 4 | from transformers.commands.convert import ConvertCommand 5 | from transformers.commands.download import DownloadCommand 6 | from transformers.commands.env import EnvironmentCommand 7 | from transformers.commands.run import RunCommand 8 | from transformers.commands.serving import ServeCommand 9 | from transformers.commands.user import UserCommands 10 | 11 | if __name__ == '__main__': 12 | parser = ArgumentParser('Transformers CLI tool', usage='transformers-cli []') 13 | commands_parser = parser.add_subparsers(help='transformers-cli command helpers') 14 | 15 | # Register commands 16 | ConvertCommand.register_subcommand(commands_parser) 17 | DownloadCommand.register_subcommand(commands_parser) 18 | EnvironmentCommand.register_subcommand(commands_parser) 19 | RunCommand.register_subcommand(commands_parser) 20 | ServeCommand.register_subcommand(commands_parser) 21 | UserCommands.register_subcommand(commands_parser) 22 | 23 | # Let's go 24 | args = parser.parse_args() 25 | 26 | if not hasattr(args, 'func'): 27 | parser.print_help() 28 | exit(1) 29 | 30 | # Run 31 | service = args.func(args) 32 | service.run() 33 | -------------------------------------------------------------------------------- /src/transformers/commands/download.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from transformers.commands import BaseTransformersCLICommand 4 | 5 | 6 | def download_command_factory(args): 7 | return DownloadCommand(args.model, args.cache_dir, args.force) 8 | 9 | 10 | class DownloadCommand(BaseTransformersCLICommand): 11 | @staticmethod 12 | def register_subcommand(parser: ArgumentParser): 13 | download_parser = parser.add_parser("download") 14 | download_parser.add_argument( 15 | "--cache-dir", type=str, default=None, help="Path to location to store the models" 16 | ) 17 | download_parser.add_argument( 18 | "--force", action="store_true", help="Force the model to be download even if already in cache-dir" 19 | ) 20 | download_parser.add_argument("model", type=str, help="Name of the model to download") 21 | download_parser.set_defaults(func=download_command_factory) 22 | 23 | def __init__(self, model: str, cache: str, force: bool): 24 | self._model = model 25 | self._cache = cache 26 | self._force = force 27 | 28 | def run(self): 29 | from transformers import AutoModel, AutoTokenizer 30 | 31 | AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 32 | AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 33 | -------------------------------------------------------------------------------- /src/transformers/tokenization_bart.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from .tokenization_roberta import RobertaTokenizer 17 | 18 | 19 | # vocab and merges same as roberta 20 | vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json" 21 | merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt" 22 | _all_bart_models = ["bart-large", "bart-large-mnli", "bart-large-cnn"] 23 | 24 | 25 | class BartTokenizer(RobertaTokenizer): 26 | # merges and vocab same as Roberta 27 | max_model_input_sizes = {m: 1024 for m in _all_bart_models} 28 | pretrained_vocab_files_map = { 29 | "vocab_file": {m: vocab_url for m in _all_bart_models}, 30 | "merges_file": {m: merges_url for m in _all_bart_models}, 31 | } 32 | -------------------------------------------------------------------------------- /src/transformers/activations.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | 7 | def swish(x): 8 | return x * torch.sigmoid(x) 9 | 10 | 11 | def _gelu_python(x): 12 | """ Original Implementation of the gelu activation function in Google Bert repo when initially created. 13 | For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 14 | 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 15 | This is now written in C in torch.nn.functional 16 | Also see https://arxiv.org/abs/1606.08415 17 | """ 18 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 19 | 20 | 21 | if torch.__version__ < "1.4.0": 22 | gelu = _gelu_python 23 | else: 24 | gelu = F.gelu 25 | 26 | 27 | def gelu_new(x): 28 | """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). 29 | Also see https://arxiv.org/abs/1606.08415 30 | """ 31 | return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 32 | 33 | 34 | ACT2FN = { 35 | "relu": F.relu, 36 | "swish": swish, 37 | "gelu": gelu, 38 | "tanh": F.tanh, 39 | "gelu_new": gelu_new, 40 | } 41 | 42 | 43 | def get_activation(activation_string): 44 | if activation_string in ACT2FN: 45 | return ACT2FN[activation_string] 46 | else: 47 | raise KeyError( 48 | "function {} not found in ACT2FN mapping {} or torch.nn.functional".format( 49 | activation_string, list(ACT2FN.keys()) 50 | ) 51 | ) 52 | -------------------------------------------------------------------------------- /src/transformers/configuration_mmbt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # Copyright (c) HuggingFace Inc. team. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ MMBT configuration """ 17 | 18 | 19 | import logging 20 | 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | class MMBTConfig(object): 26 | """Configuration class to store the configuration of a `MMBT Model`. 27 | 28 | Args: 29 | config (:obj:`~transformers.PreTrainedConfig`): 30 | Config of the underlying Transformer models. Its values are 31 | copied over to use a single config. 32 | num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`): 33 | Size of final Linear layer for classification. 34 | modal_hidden_size (:obj:`int`, optional, defautls to 2048): 35 | Embedding dimension of the non-text modality encoder. 36 | """ 37 | 38 | def __init__(self, config, num_labels=None, modal_hidden_size=2048): 39 | self.__dict__ = config.__dict__ 40 | self.modal_hidden_size = modal_hidden_size 41 | if num_labels: 42 | self.num_labels = num_labels 43 | -------------------------------------------------------------------------------- /src/transformers/configuration_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ CamemBERT configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_roberta import RobertaConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json", 28 | "umberto-commoncrawl-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-commoncrawl-cased-v1/config.json", 29 | "umberto-wikipedia-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-wikipedia-uncased-v1/config.json", 30 | } 31 | 32 | 33 | class CamembertConfig(RobertaConfig): 34 | """ 35 | This class overrides :class:`~transformers.RobertaConfig`. Please check the 36 | superclass for the appropriate documentation alongside usage examples. 37 | """ 38 | 39 | pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 40 | model_type = "camembert" 41 | -------------------------------------------------------------------------------- /scripts/preprocess_panx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-bert-base-multilingual-cased} 18 | DATA_DIR=${2:-"$REPO/download/"} 19 | 20 | TASK='panx' 21 | MAXL=128 22 | LANGS="ar,he,vi,id,jv,ms,tl,eu,ml,ta,te,af,nl,en,de,el,bn,hi,mr,ur,fa,fr,it,pt,es,bg,ru,ja,ka,ko,th,sw,yo,my,zh,kk,tr,et,fi,hu" 23 | LC="" 24 | if [ $MODEL == "bert-base-multilingual-cased" ]; then 25 | MODEL_TYPE="bert" 26 | elif [ $MODEL == "xlm-mlm-100-1280" ] || [ $MODEL == "xlm-mlm-tlm-xnli15-1024" ]; then 27 | MODEL_TYPE="xlm" 28 | LC=" --do_lower_case" 29 | elif [ $MODEL == "xlm-roberta-large" ] || [ $MODEL == "xom-roberta-base" ]; then 30 | MODEL_TYPE="xlmr" 31 | fi 32 | SAVE_DIR="$DATA_DIR/$TASK/${TASK}_processed_maxlen${MAXL}" 33 | mkdir -p $SAVE_DIR 34 | python3 $REPO/utils_preprocess.py \ 35 | --data_dir $DATA_DIR/$TASK/ \ 36 | --task panx_tokenize \ 37 | --model_name_or_path $MODEL \ 38 | --model_type $MODEL_TYPE \ 39 | --max_len $MAXL \ 40 | --output_dir $SAVE_DIR \ 41 | --languages $LANGS $LC >> $SAVE_DIR/preprocess.log 42 | if [ ! -f $SAVE_DIR/labels.txt ]; then 43 | cat $SAVE_DIR/*/*.${MODEL} | cut -f 2 | grep -v "^$" | sort | uniq > $SAVE_DIR/labels.txt 44 | fi 45 | -------------------------------------------------------------------------------- /scripts/preprocess_udpos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-bert-base-multilingual-cased} 18 | DATA_DIR=${2:-"$REPO/download/"} 19 | 20 | TASK='udpos' 21 | MAXL=128 22 | LANGS='af,ar,bg,de,el,en,es,et,eu,fa,fi,fr,he,hi,hu,id,it,ja,kk,ko,mr,nl,pt,ru,ta,te,th,tl,tr,ur,vi,yo,zh' 23 | LC="" 24 | if [ $MODEL == "bert-base-multilingual-cased" ]; then 25 | MODEL_TYPE="bert" 26 | elif [ $MODEL == "xlm-mlm-100-1280" ] || [ $MODEL == "xlm-mlm-tlm-xnli15-1024" ]; then 27 | MODEL_TYPE="xlm" 28 | LC=" --do_lower_case" 29 | elif [ $MODEL == "xlm-roberta-large" ] || [ $MODEL == "xlm-roberta-base" ]; then 30 | MODEL_TYPE="xlmr" 31 | fi 32 | 33 | SAVE_DIR="$DATA_DIR/${TASK}/udpos_processed_maxlen${MAXL}" 34 | mkdir -p $SAVE_DIR 35 | python3 $REPO/utils_preprocess.py \ 36 | --data_dir $DATA_DIR/${TASK}/ \ 37 | --task udpos_tokenize \ 38 | --model_name_or_path $MODEL \ 39 | --model_type $MODEL_TYPE \ 40 | --max_len $MAXL \ 41 | --output_dir $SAVE_DIR \ 42 | --languages $LANGS $LC >> $SAVE_DIR/process.log 43 | if [ ! -f $SAVE_DIR/labels.txt ]; then 44 | echo "create label" 45 | cat $SAVE_DIR/*/*.${MODEL} | cut -f 2 | grep -v "^$" | sort | uniq > $SAVE_DIR/labels.txt 46 | fi 47 | -------------------------------------------------------------------------------- /src/transformers/utils_encoder_decoder.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Classes to support Encoder-Decoder architectures """ 16 | 17 | 18 | def prepare_encoder_decoder_model_kwargs(**kwargs): 19 | """ Prepare the encoder and decoder's keyword arguments. 20 | 21 | Keyword arguments come in 3 flavors: 22 | - encoder-specific (prefixed by `encoder_`) 23 | - decoder-specific (prefixed by `decoder_`) 24 | - those that apply to the model as whole. 25 | 26 | We let the specific kwargs override the common ones in case of 27 | conflict. 28 | """ 29 | 30 | kwargs_common = { 31 | argument: value 32 | for argument, value in kwargs.items() 33 | if not argument.startswith("encoder_") and not argument.startswith("decoder_") 34 | } 35 | if "input_ids" in kwargs_common: 36 | kwargs["encoder_input_ids"] = kwargs_common.pop("input_ids") 37 | 38 | decoder_kwargs = kwargs_common.copy() 39 | encoder_kwargs = kwargs_common.copy() 40 | encoder_kwargs.update( 41 | {argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")} 42 | ) 43 | decoder_kwargs.update( 44 | {argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")} 45 | ) 46 | decoder_kwargs["encoder_attention_mask"] = encoder_kwargs.get("attention_mask", None) 47 | return encoder_kwargs, decoder_kwargs 48 | -------------------------------------------------------------------------------- /dockers/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel 2 | 3 | RUN apt update && \ 4 | apt install -y bash \ 5 | wget \ 6 | ssh \ 7 | vim \ 8 | build-essential \ 9 | git \ 10 | curl \ 11 | zip \ 12 | unzip \ 13 | ca-certificates \ 14 | libopenblas-dev \ 15 | libomp-dev \ 16 | autoconf \ 17 | automake \ 18 | libtool \ 19 | locales \ 20 | python3 \ 21 | python3-pip && \ 22 | rm -rf /var/lib/apt/lists 23 | 24 | # uninstall Apex if present, twice to make absolutely sure :) 25 | RUN pip uninstall -y apex || : 26 | RUN pip uninstall -y apex || : 27 | RUN PWD_DIR=$(pwd) 28 | RUN cd $(mktemp -d) 29 | RUN git clone -q https://github.com/NVIDIA/apex.git 30 | RUN cd apex; git reset --hard de6378f5dae8fcf2879a4be8ecea8bbcb9e59d5; pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ 31 | #RUN cd apex; pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" \ 32 | # --global-option="--deprecated_fused_adam" --global-option="--xentropy" \ 33 | # --global-option="--fast_multihead_attn" ./ 34 | 35 | RUN cd $(mktemp -d) 36 | RUN git clone https://github.com/neubig/kytea.git 37 | RUN cd kytea && autoreconf -i && ./configure && make && make install 38 | RUN pip install kytea 39 | 40 | RUN pip install tensorboardX six numpy tqdm path.py pandas scikit-learn lmdb pyarrow py-lz4framed methodtools py-rouge pyrouge nltk seqeval sacremoses pythainlp jieba faiss urllib3==1.25.4 networkx==1.11 41 | 42 | #RUN cd $(mktemp -d) 43 | #RUN git clone https://github.com/pytorch/fairseq 44 | #RUN cd fairseq; pip install --editable ./ 45 | #RUN chmod -R 777 /opt/conda 46 | 47 | RUN cd $PWD_DIR 48 | RUN rm -rf /var/lib/apt/lists/* && locale-gen "en_US.UTF-8" 49 | ENV LANG en_US.UTF-8 50 | ENV LANGUAGE en_US:en 51 | ENV LC_ALL en_US.UTF-8 52 | 53 | CMD ["/bin/bash"] 54 | -------------------------------------------------------------------------------- /src/transformers/configuration_xlm_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ XLM-RoBERTa configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_roberta import RobertaConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json", 28 | "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json", 29 | "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json", 30 | "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json", 31 | "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json", 32 | "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json", 33 | } 34 | 35 | 36 | class XLMRobertaConfig(RobertaConfig): 37 | """ 38 | This class overrides :class:`~transformers.RobertaConfig`. Please check the 39 | superclass for the appropriate documentation alongside usage examples. 40 | """ 41 | 42 | pretrained_config_archive_map = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 43 | model_type = "xlm-roberta" 44 | -------------------------------------------------------------------------------- /src/transformers/commands/env.py: -------------------------------------------------------------------------------- 1 | import platform 2 | from argparse import ArgumentParser 3 | 4 | from transformers import __version__ as version 5 | from transformers import is_tf_available, is_torch_available 6 | from transformers.commands import BaseTransformersCLICommand 7 | 8 | 9 | def info_command_factory(_): 10 | return EnvironmentCommand() 11 | 12 | 13 | class EnvironmentCommand(BaseTransformersCLICommand): 14 | @staticmethod 15 | def register_subcommand(parser: ArgumentParser): 16 | download_parser = parser.add_parser("env") 17 | download_parser.set_defaults(func=info_command_factory) 18 | 19 | def run(self): 20 | pt_version = "not installed" 21 | pt_cuda_available = "NA" 22 | if is_torch_available(): 23 | import torch 24 | 25 | pt_version = torch.__version__ 26 | pt_cuda_available = torch.cuda.is_available() 27 | 28 | tf_version = "not installed" 29 | tf_cuda_available = "NA" 30 | if is_tf_available(): 31 | import tensorflow as tf 32 | 33 | tf_version = tf.__version__ 34 | try: 35 | # deprecated in v2.1 36 | tf_cuda_available = tf.test.is_gpu_available() 37 | except AttributeError: 38 | # returns list of devices, convert to bool 39 | tf_cuda_available = bool(tf.config.list_physical_devices("GPU")) 40 | 41 | info = { 42 | "`transformers` version": version, 43 | "Platform": platform.platform(), 44 | "Python version": platform.python_version(), 45 | "PyTorch version (GPU?)": "{} ({})".format(pt_version, pt_cuda_available), 46 | "Tensorflow version (GPU?)": "{} ({})".format(tf_version, tf_cuda_available), 47 | "Using GPU in script?": "", 48 | "Using distributed or parallel set-up in script?": "", 49 | } 50 | 51 | print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n") 52 | print(self.format_dict(info)) 53 | 54 | return info 55 | 56 | @staticmethod 57 | def format_dict(d): 58 | return "\n".join(["- {}: {}".format(prop, val) for prop, val in d.items()]) + "\n" 59 | -------------------------------------------------------------------------------- /src/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The T5 authors and HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert T5 checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import T5Config, T5Model, load_tf_weights_in_t5 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = T5Config.from_json_file(config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = T5Model(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_t5(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained T5 model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = BertConfig.from_json_file(bert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = BertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_bert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--bert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained BERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert ALBERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = AlbertConfig.from_json_file(albert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = AlbertForMaskedLM(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_albert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--albert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained ALBERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /src/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import CONFIG_NAME, WEIGHTS_NAME, GPT2Config, GPT2Model, load_tf_weights_in_gpt2 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 30 | # Construct model 31 | if gpt2_config_file == "": 32 | config = GPT2Config() 33 | else: 34 | config = GPT2Config.from_json_file(gpt2_config_file) 35 | model = GPT2Model(config) 36 | 37 | # Load weights from numpy 38 | load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) 39 | 40 | # Save pytorch-model 41 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 42 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 43 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 44 | torch.save(model.state_dict(), pytorch_weights_dump_path) 45 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 46 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 47 | f.write(config.to_json_string()) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | # Required parameters 53 | parser.add_argument( 54 | "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 55 | ) 56 | parser.add_argument( 57 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 58 | ) 59 | parser.add_argument( 60 | "--gpt2_config_file", 61 | default="", 62 | type=str, 63 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n" 64 | "This specifies the model architecture.", 65 | ) 66 | args = parser.parse_args() 67 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path) 68 | -------------------------------------------------------------------------------- /src/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import CONFIG_NAME, WEIGHTS_NAME, OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 30 | # Construct model 31 | if openai_config_file == "": 32 | config = OpenAIGPTConfig() 33 | else: 34 | config = OpenAIGPTConfig.from_json_file(openai_config_file) 35 | model = OpenAIGPTModel(config) 36 | 37 | # Load weights from numpy 38 | load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path) 39 | 40 | # Save pytorch-model 41 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 42 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 43 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 44 | torch.save(model.state_dict(), pytorch_weights_dump_path) 45 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 46 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 47 | f.write(config.to_json_string()) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | # Required parameters 53 | parser.add_argument( 54 | "--openai_checkpoint_folder_path", 55 | default=None, 56 | type=str, 57 | required=True, 58 | help="Path to the TensorFlow checkpoint path.", 59 | ) 60 | parser.add_argument( 61 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 62 | ) 63 | parser.add_argument( 64 | "--openai_config_file", 65 | default="", 66 | type=str, 67 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n" 68 | "This specifies the model architecture.", 69 | ) 70 | args = parser.parse_args() 71 | convert_openai_checkpoint_to_pytorch( 72 | args.openai_checkpoint_folder_path, args.openai_config_file, args.pytorch_dump_folder_path 73 | ) 74 | -------------------------------------------------------------------------------- /third_party/ud-conversion-tools/conllu_to_conll.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from itertools import islice 3 | from pathlib import Path 4 | import argparse 5 | import sys, copy 6 | 7 | from lib.conll import CoNLLReader 8 | 9 | def main(): 10 | parser = argparse.ArgumentParser(description="""Convert conllu to conll format""") 11 | parser.add_argument('input', help="conllu file") 12 | parser.add_argument('output', help="target file", type=Path) 13 | parser.add_argument('--replace_subtokens_with_fused_forms', help="By default removes fused tokens", default=False, action="store_true") 14 | parser.add_argument('--remove_deprel_suffixes', help="Restrict deprels to the common universal subset, e.g. nmod:tmod becomes nmod", default=False, action="store_true") 15 | parser.add_argument('--remove_node_properties', help="space-separated list of node properties to remove: form, lemma, cpostag, postag, feats", choices=['form', 'lemma', 'cpostag','postag','feats'], metavar='prop', type=str, nargs='+') 16 | parser.add_argument('--lang', help="specify a language 2-letter code", default="default") 17 | parser.add_argument('--output_format', choices=['conll2006', 'conll2009', 'conllu'], default="conll2006") 18 | parser.add_argument('--remove_arabic_diacritics', help="remove Arabic short vowels", default=False, action="store_true") 19 | parser.add_argument('--print_comments',default=False,action="store_true") 20 | parser.add_argument('--print_fused_forms',default=False,action="store_true") 21 | 22 | args = parser.parse_args() 23 | 24 | if sys.version_info < (3,0): 25 | print("Sorry, requires Python 3.x.") #suggestion: install anaconda python 26 | sys.exit(1) 27 | 28 | POSRANKPRECEDENCEDICT = defaultdict(list) 29 | POSRANKPRECEDENCEDICT["default"] = "VERB NOUN PROPN PRON ADJ NUM ADV INTJ AUX ADP DET PART CCONJ SCONJ X PUNCT ".split(" ") 30 | # POSRANKPRECEDENCEDICT["de"] = "PROPN ADP DET ".split(" ") 31 | POSRANKPRECEDENCEDICT["es"] = "VERB AUX PRON ADP DET".split(" ") 32 | POSRANKPRECEDENCEDICT["fr"] = "VERB AUX PRON NOUN ADJ ADV ADP DET PART SCONJ CONJ".split(" ") 33 | POSRANKPRECEDENCEDICT["it"] = "VERB AUX ADV PRON ADP DET INTJ".split(" ") 34 | 35 | if args.lang in POSRANKPRECEDENCEDICT: 36 | current_pos_precedence_list = POSRANKPRECEDENCEDICT[args.lang] 37 | else: 38 | current_pos_precedence_list = POSRANKPRECEDENCEDICT["default"] 39 | 40 | cio = CoNLLReader() 41 | orig_treebank = cio.read_conll_u(args.input)#, args.keep_fused_forms, args.lang, POSRANKPRECEDENCEDICT) 42 | modif_treebank = copy.copy(orig_treebank) 43 | 44 | # As per Dec 2015 the args.lang variable is redundant once you have current_pos_precedence_list 45 | # We keep it for future modifications, i.e. any language-specific modules 46 | for s in modif_treebank: 47 | # print('sentence', s.get_sentence_as_string(printid=True)) 48 | s.filter_sentence_content(args.replace_subtokens_with_fused_forms, args.lang, current_pos_precedence_list,args.remove_node_properties,args.remove_deprel_suffixes,args.remove_arabic_diacritics) 49 | 50 | cio.write_conll(modif_treebank,args.output, args.output_format,print_fused_forms=args.print_fused_forms, print_comments=args.print_comments) 51 | 52 | if __name__ == "__main__": 53 | main() -------------------------------------------------------------------------------- /src/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import json 20 | import logging 21 | 22 | import numpy 23 | import torch 24 | 25 | from transformers import CONFIG_NAME, WEIGHTS_NAME 26 | from transformers.tokenization_xlm import VOCAB_FILES_NAMES 27 | 28 | 29 | logging.basicConfig(level=logging.INFO) 30 | 31 | 32 | def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path): 33 | # Load checkpoint 34 | chkpt = torch.load(xlm_checkpoint_path, map_location="cpu") 35 | 36 | state_dict = chkpt["model"] 37 | 38 | # We have the base model one level deeper than the original XLM repository 39 | two_levels_state_dict = {} 40 | for k, v in state_dict.items(): 41 | if "pred_layer" in k: 42 | two_levels_state_dict[k] = v 43 | else: 44 | two_levels_state_dict["transformer." + k] = v 45 | 46 | config = chkpt["params"] 47 | config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))) 48 | 49 | vocab = chkpt["dico_word2id"] 50 | vocab = dict((s + "" if s.find("@@") == -1 and i > 13 else s.replace("@@", ""), i) for s, i in vocab.items()) 51 | 52 | # Save pytorch-model 53 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 54 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 55 | pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["vocab_file"] 56 | 57 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 58 | torch.save(two_levels_state_dict, pytorch_weights_dump_path) 59 | 60 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 61 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 62 | f.write(json.dumps(config, indent=2) + "\n") 63 | 64 | print("Save vocab file to {}".format(pytorch_config_dump_path)) 65 | with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f: 66 | f.write(json.dumps(vocab, indent=2) + "\n") 67 | 68 | 69 | if __name__ == "__main__": 70 | parser = argparse.ArgumentParser() 71 | # Required parameters 72 | parser.add_argument( 73 | "--xlm_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." 74 | ) 75 | parser.add_argument( 76 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 77 | ) 78 | args = parser.parse_args() 79 | convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path) 80 | -------------------------------------------------------------------------------- /src/transformers/data/processors/xnli.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ XNLI utils (dataset loading and evaluation) """ 17 | 18 | 19 | import logging 20 | import os 21 | 22 | from .utils import DataProcessor, InputExample 23 | 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | class XnliProcessor(DataProcessor): 29 | """Processor for the XNLI dataset. 30 | Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207""" 31 | 32 | def __init__(self, language, train_language=None): 33 | self.language = language 34 | self.train_language = train_language 35 | 36 | def get_train_examples(self, data_dir): 37 | """See base class.""" 38 | lg = self.language if self.train_language is None else self.train_language 39 | lines = self._read_tsv(os.path.join(data_dir, "XNLI-MT-1.0/multinli/multinli.train.{}.tsv".format(lg))) 40 | examples = [] 41 | for (i, line) in enumerate(lines): 42 | if i == 0: 43 | continue 44 | guid = "%s-%s" % ("train", i) 45 | text_a = line[0] 46 | text_b = line[1] 47 | label = "contradiction" if line[2] == "contradictory" else line[2] 48 | assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str) 49 | examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 50 | return examples 51 | 52 | def get_test_examples(self, data_dir): 53 | """See base class.""" 54 | lines = self._read_tsv(os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv")) 55 | examples = [] 56 | for (i, line) in enumerate(lines): 57 | if i == 0: 58 | continue 59 | language = line[0] 60 | if language != self.language: 61 | continue 62 | guid = "%s-%s" % ("test", i) 63 | text_a = line[6] 64 | text_b = line[7] 65 | label = line[1] 66 | assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str) 67 | examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 68 | return examples 69 | 70 | def get_labels(self): 71 | """See base class.""" 72 | return ["contradiction", "entailment", "neutral"] 73 | 74 | 75 | xnli_processors = { 76 | "xnli": XnliProcessor, 77 | } 78 | 79 | xnli_output_modes = { 80 | "xnli": "classification", 81 | } 82 | 83 | xnli_tasks_num_labels = { 84 | "xnli": 3, 85 | } 86 | -------------------------------------------------------------------------------- /src/transformers/configuration_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ RoBERTa configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_bert import BertConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", 28 | "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", 29 | "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", 30 | "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json", 31 | "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-config.json", 32 | "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-config.json", 33 | } 34 | 35 | 36 | class RobertaConfig(BertConfig): 37 | r""" 38 | This is the configuration class to store the configuration of an :class:`~transformers.RobertaModel`. 39 | It is used to instantiate an RoBERTa model according to the specified arguments, defining the model 40 | architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of 41 | the BERT `bert-base-uncased `__ architecture. 42 | 43 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used 44 | to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` 45 | for more information. 46 | 47 | The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`. 48 | It reuses the same defaults. Please check the parent class for more information. 49 | 50 | Example:: 51 | 52 | from transformers import RobertaConfig, RobertaModel 53 | 54 | # Initializing a RoBERTa configuration 55 | configuration = RobertaConfig() 56 | 57 | # Initializing a model from the configuration 58 | model = RobertaModel(configuration) 59 | 60 | # Accessing the model configuration 61 | configuration = model.config 62 | 63 | Attributes: 64 | pretrained_config_archive_map (Dict[str, str]): 65 | A dictionary containing all the available pre-trained checkpoints. 66 | """ 67 | pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 68 | model_type = "roberta" 69 | -------------------------------------------------------------------------------- /src/transformers/tokenization_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for DistilBERT.""" 16 | 17 | 18 | import logging 19 | 20 | from .tokenization_bert import BertTokenizer, BertTokenizerFast 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 26 | 27 | PRETRAINED_VOCAB_FILES_MAP = { 28 | "vocab_file": { 29 | "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 30 | "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 31 | "distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", 32 | "distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", 33 | "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt", 34 | "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", 35 | } 36 | } 37 | 38 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 39 | "distilbert-base-uncased": 512, 40 | "distilbert-base-uncased-distilled-squad": 512, 41 | "distilbert-base-cased": 512, 42 | "distilbert-base-cased-distilled-squad": 512, 43 | "distilbert-base-german-cased": 512, 44 | "distilbert-base-multilingual-cased": 512, 45 | } 46 | 47 | 48 | PRETRAINED_INIT_CONFIGURATION = { 49 | "distilbert-base-uncased": {"do_lower_case": True}, 50 | "distilbert-base-uncased-distilled-squad": {"do_lower_case": True}, 51 | "distilbert-base-cased": {"do_lower_case": False}, 52 | "distilbert-base-cased-distilled-squad": {"do_lower_case": False}, 53 | "distilbert-base-german-cased": {"do_lower_case": False}, 54 | "distilbert-base-multilingual-cased": {"do_lower_case": False}, 55 | } 56 | 57 | 58 | class DistilBertTokenizer(BertTokenizer): 59 | r""" 60 | Constructs a DistilBertTokenizer. 61 | :class:`~transformers.DistilBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 62 | tokenization: punctuation splitting + wordpiece. 63 | 64 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 65 | parameters. 66 | """ 67 | 68 | vocab_files_names = VOCAB_FILES_NAMES 69 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 70 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 71 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 72 | 73 | 74 | class DistilBertTokenizerFast(BertTokenizerFast): 75 | vocab_files_names = VOCAB_FILES_NAMES 76 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 77 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 78 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 79 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FILTER: An Enhanced Fusion Method for Cross-lingual Language Understanding 2 | 3 | This is the official repository of [FILTER](https://arxiv.org/abs/2009.05166). 4 | 5 | ## Requirements 6 | We provide Docker image for easier reproduction. Please use `dockers/Dockerfile` or pull image directly. 7 | ```bash 8 | docker pull studyfang/multilingual:xtreme 9 | ``` 10 | 11 | To run docker without sudo permission, please refer this documentation [Manage Docker as a non-root user](https://docs.docker.com/install/linux/linux-postinstall/). 12 | Then, you could start docker, e.g. 13 | ```bash 14 | docker run --gpus all -it -v /path/to/FILTER:/ssd -it studyfang/multilingual:xtreme bash 15 | ``` 16 | 17 | ## Quick Start 18 | 19 | **NOTE**: Please make sure you have set up the environment correctly. 20 | 21 | 1. Download data and our models 22 | 23 | Please set your `DATA_ROOT` in init.sh, and then run the following command to download specified task and its pretrained FILTER models. 24 | ```bash 25 | bash scripts/download_data.sh ${task} 26 | ``` 27 | 28 | To download all tasks and its pretrained models, please run `bash scripts/download_data.sh` which may take a while. 29 | 30 | 31 | 2. Evaluate our pretrained models which are save in `$DATA_ROOT/outputs/phase${idx}/${task}` : 32 | ```bash 33 | bash eval.sh -t ${task} -n phase${idx}/${task} 34 | ``` 35 | 36 | where 37 | - `idx` could be `1` (without self-teaching) or `2`(+ self-teaching). 38 | - `task` is the name of the task to evaluate from (`[xnli, pawsx, mlqa, tydiqa, xquad, udpos, panx]`) 39 | 40 | ## Model Training 41 | For QA model training, we use translated training data from XTREME team. Please refere to their [repo](https://github.com/google-research/xtreme) or their [translation](https://console.cloud.google.com/storage/browser/xtreme_translations) directly. 42 | Once your data is ready, simply run the following command to train a FILTER model for supported XTREME tasks: 43 | ```bash 44 | bash train.sh -t ${task} -n ${task} 45 | ``` 46 | To use different number of local and fusion layers, you can run this command: 47 | ```bash 48 | bash train.sh -t ${task} -n ${task}_k${k}_m${m} -x "--filter_k ${k} --filter_m ${m}" 49 | ``` 50 | 51 | where 52 | - `task` is the name of the task to train from (`[xnli, pawsx, mlqa, tydiqa, xquad, udpos, panx]`) 53 | - `k` is the number of fusion layers 54 | - `m` is the number of local layers 55 | 56 | The output model will be save into `${DATA_ROOT}/outputs/${task}_k${k}_m${m}`. 57 | 58 | **Note that we ran experiments on 8 V100 GPUs for FILTER models. You may need to increase `gradient_accumulation_steps` if you have less GPUs.** 59 | 60 | 61 | ## Citation 62 | If you use this code useful, please star our repo or consider citing: 63 | ``` 64 | @article{fang2020filter, 65 | title={FILTER: An enhanced fusion method for cross-lingual language understanding}, 66 | author={Fang, Yuwei and Wang, Shuohang and Gan, Zhe and Sun, Siqi and Liu, Jingjing}, 67 | journal={arXiv preprint arXiv:2009.05166}, 68 | year={2020} 69 | } 70 | ``` 71 | 72 | ## Contributing 73 | 74 | This project welcomes contributions and suggestions. Most contributions require you to 75 | agree to a Contributor License Agreement (CLA) declaring that you have the right to, 76 | and actually do, grant us the rights to use your contribution. For details, visit 77 | https://cla.microsoft.com. 78 | 79 | When you submit a pull request, a CLA-bot will automatically determine whether you need 80 | to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the 81 | instructions provided by the bot. You will only need to do this once across all repositories using our CLA. 82 | 83 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 84 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 85 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 86 | 87 | ## License 88 | 89 | MIT 90 | -------------------------------------------------------------------------------- /src/transformers/commands/run.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from argparse import ArgumentParser 3 | 4 | from transformers.commands import BaseTransformersCLICommand 5 | from transformers.pipelines import SUPPORTED_TASKS, Pipeline, PipelineDataFormat, pipeline 6 | 7 | 8 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 9 | 10 | 11 | def try_infer_format_from_ext(path: str): 12 | if not path: 13 | return "pipe" 14 | 15 | for ext in PipelineDataFormat.SUPPORTED_FORMATS: 16 | if path.endswith(ext): 17 | return ext 18 | 19 | raise Exception( 20 | "Unable to determine file format from file extension {}. " 21 | "Please provide the format through --format {}".format(path, PipelineDataFormat.SUPPORTED_FORMATS) 22 | ) 23 | 24 | 25 | def run_command_factory(args): 26 | nlp = pipeline( 27 | task=args.task, 28 | model=args.model if args.model else None, 29 | config=args.config, 30 | tokenizer=args.tokenizer, 31 | device=args.device, 32 | ) 33 | format = try_infer_format_from_ext(args.input) if args.format == "infer" else args.format 34 | reader = PipelineDataFormat.from_str( 35 | format=format, 36 | output_path=args.output, 37 | input_path=args.input, 38 | column=args.column if args.column else nlp.default_input_names, 39 | overwrite=args.overwrite, 40 | ) 41 | return RunCommand(nlp, reader) 42 | 43 | 44 | class RunCommand(BaseTransformersCLICommand): 45 | def __init__(self, nlp: Pipeline, reader: PipelineDataFormat): 46 | self._nlp = nlp 47 | self._reader = reader 48 | 49 | @staticmethod 50 | def register_subcommand(parser: ArgumentParser): 51 | run_parser = parser.add_parser("run", help="Run a pipeline through the CLI") 52 | run_parser.add_argument("--task", choices=SUPPORTED_TASKS.keys(), help="Task to run") 53 | run_parser.add_argument("--input", type=str, help="Path to the file to use for inference") 54 | run_parser.add_argument("--output", type=str, help="Path to the file that will be used post to write results.") 55 | run_parser.add_argument("--model", type=str, help="Name or path to the model to instantiate.") 56 | run_parser.add_argument("--config", type=str, help="Name or path to the model's config to instantiate.") 57 | run_parser.add_argument( 58 | "--tokenizer", type=str, help="Name of the tokenizer to use. (default: same as the model name)" 59 | ) 60 | run_parser.add_argument( 61 | "--column", 62 | type=str, 63 | help="Name of the column to use as input. (For multi columns input as QA use column1,columns2)", 64 | ) 65 | run_parser.add_argument( 66 | "--format", 67 | type=str, 68 | default="infer", 69 | choices=PipelineDataFormat.SUPPORTED_FORMATS, 70 | help="Input format to read from", 71 | ) 72 | run_parser.add_argument( 73 | "--device", 74 | type=int, 75 | default=-1, 76 | help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)", 77 | ) 78 | run_parser.add_argument("--overwrite", action="store_true", help="Allow overwriting the output file.") 79 | run_parser.set_defaults(func=run_command_factory) 80 | 81 | def run(self): 82 | nlp, outputs = self._nlp, [] 83 | 84 | for entry in self._reader: 85 | output = nlp(**entry) if self._reader.is_multi_columns else nlp(entry) 86 | if isinstance(output, dict): 87 | outputs.append(output) 88 | else: 89 | outputs += output 90 | 91 | # Saving data 92 | if self._nlp.binary_output: 93 | binary_path = self._reader.save_binary(outputs) 94 | logger.warning("Current pipeline requires output to be in binary format, saving at {}".format(binary_path)) 95 | else: 96 | self._reader.save(outputs) 97 | -------------------------------------------------------------------------------- /src/transformers/configuration_bart.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Fairseq Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ BART configuration """ 16 | 17 | 18 | import logging 19 | 20 | from .configuration_utils import PretrainedConfig 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | BART_PRETRAINED_CONFIG_ARCHIVE_MAP = { 26 | "bart-large": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large/config.json", 27 | "bart-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-mnli/config.json", 28 | "bart-large-cnn": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-cnn/config.json", 29 | } 30 | 31 | 32 | class BartConfig(PretrainedConfig): 33 | r""" 34 | Configuration class for Bart. Parameters are renamed from the fairseq implementation 35 | """ 36 | model_type = "bart" 37 | pretrained_config_archive_map = BART_PRETRAINED_CONFIG_ARCHIVE_MAP 38 | 39 | def __init__( 40 | self, 41 | activation_dropout=0.0, 42 | vocab_size=50265, 43 | pad_token_id=1, 44 | eos_token_id=2, 45 | d_model=1024, 46 | encoder_ffn_dim=4096, 47 | encoder_layers=12, 48 | encoder_attention_heads=16, 49 | decoder_ffn_dim=4096, 50 | decoder_layers=12, 51 | decoder_attention_heads=16, 52 | encoder_layerdrop=0.0, 53 | decoder_layerdrop=0.0, 54 | attention_dropout=0.0, 55 | dropout=0.1, 56 | max_position_embeddings=1024, 57 | init_std=0.02, 58 | classifier_dropout=0.0, 59 | output_past=False, 60 | num_labels=3, 61 | bos_token_id=0, 62 | **common_kwargs 63 | ): 64 | r""" 65 | :class:`~transformers.BartConfig` is the configuration class for `BartModel`. 66 | Examples: 67 | config = BartConfig.from_pretrained('bart-large') 68 | model = BartModel(config) 69 | """ 70 | super().__init__( 71 | num_labels=num_labels, 72 | output_past=output_past, 73 | pad_token_id=pad_token_id, 74 | bos_token_id=bos_token_id, 75 | **common_kwargs, 76 | ) 77 | self.vocab_size = vocab_size 78 | self.d_model = d_model # encoder_embed_dim and decoder_embed_dim 79 | self.eos_token_id = eos_token_id 80 | self.encoder_ffn_dim = encoder_ffn_dim 81 | self.encoder_layers = self.num_hidden_layers = encoder_layers 82 | self.encoder_attention_heads = encoder_attention_heads 83 | self.encoder_layerdrop = encoder_layerdrop 84 | self.decoder_layerdrop = decoder_layerdrop 85 | self.decoder_ffn_dim = decoder_ffn_dim 86 | self.decoder_layers = decoder_layers 87 | self.decoder_attention_heads = decoder_attention_heads 88 | self.max_position_embeddings = max_position_embeddings 89 | self.init_std = init_std # Normal(0, this parameter) 90 | 91 | # 3 Types of Dropout 92 | self.attention_dropout = attention_dropout 93 | self.activation_dropout = activation_dropout 94 | self.dropout = dropout 95 | 96 | # Classifier stuff 97 | self.classif_dropout = classifier_dropout 98 | 99 | @property 100 | def num_attention_heads(self): 101 | return self.encoder_attention_heads 102 | 103 | @property 104 | def hidden_size(self): 105 | return self.d_model 106 | -------------------------------------------------------------------------------- /src/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | import os 21 | 22 | import torch 23 | 24 | from transformers import ( 25 | CONFIG_NAME, 26 | WEIGHTS_NAME, 27 | XLNetConfig, 28 | XLNetForQuestionAnswering, 29 | XLNetForSequenceClassification, 30 | XLNetLMHeadModel, 31 | load_tf_weights_in_xlnet, 32 | ) 33 | 34 | 35 | GLUE_TASKS_NUM_LABELS = { 36 | "cola": 2, 37 | "mnli": 3, 38 | "mrpc": 2, 39 | "sst-2": 2, 40 | "sts-b": 1, 41 | "qqp": 2, 42 | "qnli": 2, 43 | "rte": 2, 44 | "wnli": 2, 45 | } 46 | 47 | 48 | logging.basicConfig(level=logging.INFO) 49 | 50 | 51 | def convert_xlnet_checkpoint_to_pytorch( 52 | tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None 53 | ): 54 | # Initialise PyTorch model 55 | config = XLNetConfig.from_json_file(bert_config_file) 56 | 57 | finetuning_task = finetuning_task.lower() if finetuning_task is not None else "" 58 | if finetuning_task in GLUE_TASKS_NUM_LABELS: 59 | print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config))) 60 | config.finetuning_task = finetuning_task 61 | config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task] 62 | model = XLNetForSequenceClassification(config) 63 | elif "squad" in finetuning_task: 64 | config.finetuning_task = finetuning_task 65 | model = XLNetForQuestionAnswering(config) 66 | else: 67 | model = XLNetLMHeadModel(config) 68 | 69 | # Load weights from tf checkpoint 70 | load_tf_weights_in_xlnet(model, config, tf_checkpoint_path) 71 | 72 | # Save pytorch-model 73 | pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) 74 | pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) 75 | print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) 76 | torch.save(model.state_dict(), pytorch_weights_dump_path) 77 | print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) 78 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 79 | f.write(config.to_json_string()) 80 | 81 | 82 | if __name__ == "__main__": 83 | parser = argparse.ArgumentParser() 84 | # Required parameters 85 | parser.add_argument( 86 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 87 | ) 88 | parser.add_argument( 89 | "--xlnet_config_file", 90 | default=None, 91 | type=str, 92 | required=True, 93 | help="The config json file corresponding to the pre-trained XLNet model. \n" 94 | "This specifies the model architecture.", 95 | ) 96 | parser.add_argument( 97 | "--pytorch_dump_folder_path", 98 | default=None, 99 | type=str, 100 | required=True, 101 | help="Path to the folder to store the PyTorch model or dataset/vocab.", 102 | ) 103 | parser.add_argument( 104 | "--finetuning_task", 105 | default=None, 106 | type=str, 107 | help="Name of a task on which the XLNet TensorFloaw model was fine-tuned", 108 | ) 109 | args = parser.parse_args() 110 | print(args) 111 | 112 | convert_xlnet_checkpoint_to_pytorch( 113 | args.tf_checkpoint_path, args.xlnet_config_file, args.pytorch_dump_folder_path, args.finetuning_task 114 | ) 115 | -------------------------------------------------------------------------------- /src/transformers/configuration_t5.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2010, The T5 Authors and HuggingFace Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ T5 model configuration """ 16 | 17 | 18 | import logging 19 | 20 | from .configuration_utils import PretrainedConfig 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | T5_PRETRAINED_CONFIG_ARCHIVE_MAP = { 26 | "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json", 27 | "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json", 28 | "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json", 29 | "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json", 30 | "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json", 31 | } 32 | 33 | 34 | class T5Config(PretrainedConfig): 35 | r""" 36 | :class:`~transformers.T5Config` is the configuration class to store the configuration of a 37 | `T5Model`. 38 | 39 | 40 | Arguments: 41 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `T5Model`. 42 | hidden_size: Size of the encoder layers and the pooler layer. 43 | num_hidden_layers: Number of hidden layers in the Transformer encoder. 44 | num_attention_heads: Number of attention heads for each attention layer in 45 | the Transformer encoder. 46 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 47 | layer in the Transformer encoder. 48 | hidden_act: The non-linear activation function (function or string) in the 49 | encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. 50 | hidden_dropout_prob: The dropout probabilitiy for all fully connected 51 | layers in the embeddings, encoder, and pooler. 52 | attention_probs_dropout_prob: The dropout ratio for the attention 53 | probabilities. 54 | max_position_embeddings: The maximum sequence length that this model might 55 | ever be used with. Typically set this to something large just in case 56 | (e.g., 512 or 1024 or 2048). 57 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 58 | `T5Model`. 59 | initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing). 60 | layer_norm_eps: The epsilon used by LayerNorm. 61 | """ 62 | pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP 63 | model_type = "t5" 64 | 65 | def __init__( 66 | self, 67 | vocab_size=32128, 68 | n_positions=512, 69 | d_model=512, 70 | d_kv=64, 71 | d_ff=2048, 72 | num_layers=6, 73 | num_heads=8, 74 | relative_attention_num_buckets=32, 75 | dropout_rate=0.1, 76 | layer_norm_epsilon=1e-6, 77 | initializer_factor=1.0, 78 | **kwargs 79 | ): 80 | super().__init__(**kwargs) 81 | self.vocab_size = vocab_size 82 | self.n_positions = n_positions 83 | self.d_model = d_model 84 | self.d_kv = d_kv 85 | self.d_ff = d_ff 86 | self.num_layers = num_layers 87 | self.num_heads = num_heads 88 | self.relative_attention_num_buckets = relative_attention_num_buckets 89 | self.dropout_rate = dropout_rate 90 | self.layer_norm_epsilon = layer_norm_epsilon 91 | self.initializer_factor = initializer_factor 92 | 93 | @property 94 | def max_position_embeddings(self): 95 | return self.n_positions 96 | 97 | @property 98 | def hidden_size(self): 99 | return self.d_model 100 | 101 | @property 102 | def num_attention_heads(self): 103 | return self.num_heads 104 | 105 | @property 106 | def num_hidden_layers(self): 107 | return self.num_layers 108 | -------------------------------------------------------------------------------- /src/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" 17 | 18 | import argparse 19 | import os 20 | 21 | import numpy as np 22 | import tensorflow as tf 23 | import torch 24 | 25 | from transformers import BertModel 26 | 27 | 28 | def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str): 29 | 30 | """ 31 | :param model:BertModel Pytorch model instance to be converted 32 | :param ckpt_dir: Tensorflow model directory 33 | :param model_name: model name 34 | :return: 35 | 36 | Currently supported HF models: 37 | Y BertModel 38 | N BertForMaskedLM 39 | N BertForPreTraining 40 | N BertForMultipleChoice 41 | N BertForNextSentencePrediction 42 | N BertForSequenceClassification 43 | N BertForQuestionAnswering 44 | """ 45 | 46 | tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value") 47 | 48 | var_map = ( 49 | ("layer.", "layer_"), 50 | ("word_embeddings.weight", "word_embeddings"), 51 | ("position_embeddings.weight", "position_embeddings"), 52 | ("token_type_embeddings.weight", "token_type_embeddings"), 53 | (".", "/"), 54 | ("LayerNorm/weight", "LayerNorm/gamma"), 55 | ("LayerNorm/bias", "LayerNorm/beta"), 56 | ("weight", "kernel"), 57 | ) 58 | 59 | if not os.path.isdir(ckpt_dir): 60 | os.makedirs(ckpt_dir) 61 | 62 | state_dict = model.state_dict() 63 | 64 | def to_tf_var_name(name: str): 65 | for patt, repl in iter(var_map): 66 | name = name.replace(patt, repl) 67 | return "bert/{}".format(name) 68 | 69 | def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session): 70 | tf_dtype = tf.dtypes.as_dtype(tensor.dtype) 71 | tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) 72 | session.run(tf.variables_initializer([tf_var])) 73 | session.run(tf_var) 74 | return tf_var 75 | 76 | tf.reset_default_graph() 77 | with tf.Session() as session: 78 | for var_name in state_dict: 79 | tf_name = to_tf_var_name(var_name) 80 | torch_tensor = state_dict[var_name].numpy() 81 | if any([x in var_name for x in tensors_to_transpose]): 82 | torch_tensor = torch_tensor.T 83 | tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) 84 | tf.keras.backend.set_value(tf_var, torch_tensor) 85 | tf_weight = session.run(tf_var) 86 | print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor))) 87 | 88 | saver = tf.train.Saver(tf.trainable_variables()) 89 | saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) 90 | 91 | 92 | def main(raw_args=None): 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument("--model_name", type=str, required=True, help="model name e.g. bert-base-uncased") 95 | parser.add_argument( 96 | "--cache_dir", type=str, default=None, required=False, help="Directory containing pytorch model" 97 | ) 98 | parser.add_argument("--pytorch_model_path", type=str, required=True, help="/path/to/.bin") 99 | parser.add_argument("--tf_cache_dir", type=str, required=True, help="Directory in which to save tensorflow model") 100 | args = parser.parse_args(raw_args) 101 | 102 | model = BertModel.from_pretrained( 103 | pretrained_model_name_or_path=args.model_name, 104 | state_dict=torch.load(args.pytorch_model_path), 105 | cache_dir=args.cache_dir, 106 | ) 107 | 108 | convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_cache_dir, model_name=args.model_name) 109 | 110 | 111 | if __name__ == "__main__": 112 | main() 113 | -------------------------------------------------------------------------------- /src/transformers/convert_bart_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BART checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | from pathlib import Path 21 | 22 | import fairseq 23 | import torch 24 | from packaging import version 25 | 26 | from transformers import BartConfig, BartForMaskedLM, BartForSequenceClassification, BartModel, BartTokenizer 27 | 28 | 29 | FAIRSEQ_MODELS = ["bart.large", "bart.large.mnli", "bart.large.cnn"] 30 | 31 | if version.parse(fairseq.__version__) < version.parse("0.9.0"): 32 | raise Exception("requires fairseq >= 0.9.0") 33 | 34 | 35 | logging.basicConfig(level=logging.INFO) 36 | logger = logging.getLogger(__name__) 37 | 38 | SAMPLE_TEXT = " Hello world! cécé herlolip" 39 | 40 | rename_keys = [ 41 | ("model.classification_heads.mnli.dense.weight", "classification_head.dense.weight"), 42 | ("model.classification_heads.mnli.dense.bias", "classification_head.dense.bias"), 43 | ("model.classification_heads.mnli.out_proj.weight", "classification_head.out_proj.weight"), 44 | ("model.classification_heads.mnli.out_proj.bias", "classification_head.out_proj.bias"), 45 | ] 46 | IGNORE_KEYS = ["encoder.version", "decoder.version", "model.encoder.version", "model.decoder.version", "_float_tensor"] 47 | 48 | 49 | def rename_key(dct, old, new): 50 | val = dct.pop(old) 51 | dct[new] = val 52 | 53 | 54 | def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path): 55 | """ 56 | Copy/paste/tweak model's weights to our BERT structure. 57 | """ 58 | bart = torch.hub.load("pytorch/fairseq", checkpoint_path) 59 | bart.eval() # disable dropout 60 | bart.model.upgrade_state_dict(bart.model.state_dict()) 61 | hf_model_name = checkpoint_path.replace(".", "-") 62 | config = BartConfig.from_pretrained(hf_model_name) 63 | tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0) 64 | tokens2 = BartTokenizer.from_pretrained(hf_model_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0) 65 | assert torch.eq(tokens, tokens2).all() 66 | 67 | if checkpoint_path in ["bart.large", "bart.large.cnn"]: 68 | state_dict = bart.model.state_dict() 69 | for k in IGNORE_KEYS: 70 | state_dict.pop(k, None) 71 | state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] 72 | model = BartModel(config) 73 | their_output = bart.extract_features(tokens) 74 | else: # MNLI Case 75 | state_dict = bart.state_dict() 76 | for k in IGNORE_KEYS: 77 | state_dict.pop(k, None) 78 | state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"] 79 | for src, dest in rename_keys: 80 | rename_key(state_dict, src, dest) 81 | model = BartForSequenceClassification(config) 82 | their_output = bart.predict("mnli", tokens, return_logits=True) 83 | 84 | # Load state dict 85 | model.load_state_dict(state_dict) 86 | model.eval() 87 | # Check results 88 | 89 | if checkpoint_path == "bart.large.cnn": # generate doesnt work yet 90 | model = BartForMaskedLM(config, base_model=model) 91 | assert "lm_head.weight" in model.state_dict() 92 | assert model.lm_head.out_features == config.max_position_embeddings 93 | model.eval() 94 | our_outputs = model.model.forward(tokens)[0] 95 | else: 96 | our_outputs = model.forward(tokens)[0] 97 | assert their_output.shape == our_outputs.shape 98 | assert (their_output == our_outputs).all().item() 99 | Path(pytorch_dump_folder_path).mkdir(exist_ok=True) 100 | model.save_pretrained(pytorch_dump_folder_path) 101 | 102 | 103 | if __name__ == "__main__": 104 | parser = argparse.ArgumentParser() 105 | # Required parameters 106 | parser.add_argument("fairseq_path", choices=FAIRSEQ_MODELS, type=str, help="") 107 | 108 | parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") 109 | args = parser.parse_args() 110 | convert_bart_checkpoint( 111 | args.fairseq_path, args.pytorch_dump_folder_path, 112 | ) 113 | -------------------------------------------------------------------------------- /src/transformers/modeling_tf_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ TF 2.0 RoBERTa model. """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_camembert import CamembertConfig 22 | from .file_utils import add_start_docstrings 23 | from .modeling_tf_roberta import ( 24 | TFRobertaForMaskedLM, 25 | TFRobertaForSequenceClassification, 26 | TFRobertaForTokenClassification, 27 | TFRobertaModel, 28 | ) 29 | 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {} 34 | 35 | 36 | CAMEMBERT_START_DOCSTRING = r""" 37 | 38 | .. note:: 39 | 40 | TF 2.0 models accepts two formats as inputs: 41 | 42 | - having all inputs as keyword arguments (like PyTorch models), or 43 | - having all inputs as a list, tuple or dict in the first positional arguments. 44 | 45 | This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having 46 | all the tensors in the first argument of the model call function: :obj:`model(inputs)`. 47 | 48 | If you choose this second option, there are three possibilities you can use to gather all the input Tensors 49 | in the first positional argument : 50 | 51 | - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` 52 | - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: 53 | :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` 54 | - a dictionary with one or several input Tensors associated to the input names given in the docstring: 55 | :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` 56 | 57 | Parameters: 58 | config (:class:`~transformers.CamembertConfig`): Model configuration class with all the parameters of the 59 | model. Initializing with a config file does not load the weights associated with the model, only the configuration. 60 | Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. 61 | """ 62 | 63 | 64 | @add_start_docstrings( 65 | "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.", 66 | CAMEMBERT_START_DOCSTRING, 67 | ) 68 | class TFCamembertModel(TFRobertaModel): 69 | """ 70 | This class overrides :class:`~transformers.TFRobertaModel`. Please check the 71 | superclass for the appropriate documentation alongside usage examples. 72 | """ 73 | 74 | config_class = CamembertConfig 75 | pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 76 | 77 | 78 | @add_start_docstrings( 79 | """CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING, 80 | ) 81 | class TFCamembertForMaskedLM(TFRobertaForMaskedLM): 82 | """ 83 | This class overrides :class:`~transformers.TFRobertaForMaskedLM`. Please check the 84 | superclass for the appropriate documentation alongside usage examples. 85 | """ 86 | 87 | config_class = CamembertConfig 88 | pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 89 | 90 | 91 | @add_start_docstrings( 92 | """CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer 93 | on top of the pooled output) e.g. for GLUE tasks. """, 94 | CAMEMBERT_START_DOCSTRING, 95 | ) 96 | class TFCamembertForSequenceClassification(TFRobertaForSequenceClassification): 97 | """ 98 | This class overrides :class:`~transformers.TFRobertaForSequenceClassification`. Please check the 99 | superclass for the appropriate documentation alongside usage examples. 100 | """ 101 | 102 | config_class = CamembertConfig 103 | pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 104 | 105 | 106 | @add_start_docstrings( 107 | """CamemBERT Model with a token classification head on top (a linear layer on top of 108 | the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, 109 | CAMEMBERT_START_DOCSTRING, 110 | ) 111 | class TFCamembertForTokenClassification(TFRobertaForTokenClassification): 112 | """ 113 | This class overrides :class:`~transformers.TFRobertaForTokenClassification`. Please check the 114 | superclass for the appropriate documentation alongside usage examples. 115 | """ 116 | 117 | config_class = CamembertConfig 118 | pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 119 | -------------------------------------------------------------------------------- /src/transformers/modeling_tf_xlm_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ TF 2.0 XLM-RoBERTa model. """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_xlm_roberta import XLMRobertaConfig 22 | from .file_utils import add_start_docstrings 23 | from .modeling_tf_roberta import ( 24 | TFRobertaForMaskedLM, 25 | TFRobertaForSequenceClassification, 26 | TFRobertaForTokenClassification, 27 | TFRobertaModel, 28 | ) 29 | 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {} 34 | 35 | 36 | XLM_ROBERTA_START_DOCSTRING = r""" 37 | 38 | .. note:: 39 | 40 | TF 2.0 models accepts two formats as inputs: 41 | 42 | - having all inputs as keyword arguments (like PyTorch models), or 43 | - having all inputs as a list, tuple or dict in the first positional arguments. 44 | 45 | This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having 46 | all the tensors in the first argument of the model call function: :obj:`model(inputs)`. 47 | 48 | If you choose this second option, there are three possibilities you can use to gather all the input Tensors 49 | in the first positional argument : 50 | 51 | - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` 52 | - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: 53 | :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` 54 | - a dictionary with one or several input Tensors associated to the input names given in the docstring: 55 | :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` 56 | 57 | Parameters: 58 | config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the 59 | model. Initializing with a config file does not load the weights associated with the model, only the configuration. 60 | Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. 61 | """ 62 | 63 | 64 | @add_start_docstrings( 65 | "The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", 66 | XLM_ROBERTA_START_DOCSTRING, 67 | ) 68 | class TFXLMRobertaModel(TFRobertaModel): 69 | """ 70 | This class overrides :class:`~transformers.TFRobertaModel`. Please check the 71 | superclass for the appropriate documentation alongside usage examples. 72 | """ 73 | 74 | config_class = XLMRobertaConfig 75 | pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 76 | 77 | 78 | @add_start_docstrings( 79 | """XLM-RoBERTa Model with a `language modeling` head on top. """, XLM_ROBERTA_START_DOCSTRING, 80 | ) 81 | class TFXLMRobertaForMaskedLM(TFRobertaForMaskedLM): 82 | """ 83 | This class overrides :class:`~transformers.TFRobertaForMaskedLM`. Please check the 84 | superclass for the appropriate documentation alongside usage examples. 85 | """ 86 | 87 | config_class = XLMRobertaConfig 88 | pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 89 | 90 | 91 | @add_start_docstrings( 92 | """XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer 93 | on top of the pooled output) e.g. for GLUE tasks. """, 94 | XLM_ROBERTA_START_DOCSTRING, 95 | ) 96 | class TFXLMRobertaForSequenceClassification(TFRobertaForSequenceClassification): 97 | """ 98 | This class overrides :class:`~transformers.TFRobertaForSequenceClassification`. Please check the 99 | superclass for the appropriate documentation alongside usage examples. 100 | """ 101 | 102 | config_class = XLMRobertaConfig 103 | pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 104 | 105 | 106 | @add_start_docstrings( 107 | """XLM-RoBERTa Model with a token classification head on top (a linear layer on top of 108 | the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, 109 | XLM_ROBERTA_START_DOCSTRING, 110 | ) 111 | class TFXLMRobertaForTokenClassification(TFRobertaForTokenClassification): 112 | """ 113 | This class overrides :class:`~transformers.TFRobertaForTokenClassification`. Please check the 114 | superclass for the appropriate documentation alongside usage examples. 115 | """ 116 | 117 | config_class = XLMRobertaConfig 118 | pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 119 | -------------------------------------------------------------------------------- /src/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert Transformer XL checkpoint and datasets.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | import os 21 | import pickle 22 | import sys 23 | 24 | import torch 25 | 26 | import transformers.tokenization_transfo_xl as data_utils 27 | from transformers import ( 28 | CONFIG_NAME, 29 | WEIGHTS_NAME, 30 | TransfoXLConfig, 31 | TransfoXLLMHeadModel, 32 | load_tf_weights_in_transfo_xl, 33 | ) 34 | from transformers.tokenization_transfo_xl import CORPUS_NAME, VOCAB_FILES_NAMES 35 | 36 | 37 | logging.basicConfig(level=logging.INFO) 38 | 39 | # We do this to be able to load python 2 datasets pickles 40 | # See e.g. https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918 41 | data_utils.Vocab = data_utils.TransfoXLTokenizer 42 | data_utils.Corpus = data_utils.TransfoXLCorpus 43 | sys.modules["data_utils"] = data_utils 44 | sys.modules["vocabulary"] = data_utils 45 | 46 | 47 | def convert_transfo_xl_checkpoint_to_pytorch( 48 | tf_checkpoint_path, transfo_xl_config_file, pytorch_dump_folder_path, transfo_xl_dataset_file 49 | ): 50 | if transfo_xl_dataset_file: 51 | # Convert a pre-processed corpus (see original TensorFlow repo) 52 | with open(transfo_xl_dataset_file, "rb") as fp: 53 | corpus = pickle.load(fp, encoding="latin1") 54 | # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term) 55 | pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["pretrained_vocab_file"] 56 | print("Save vocabulary to {}".format(pytorch_vocab_dump_path)) 57 | corpus_vocab_dict = corpus.vocab.__dict__ 58 | torch.save(corpus_vocab_dict, pytorch_vocab_dump_path) 59 | 60 | corpus_dict_no_vocab = corpus.__dict__ 61 | corpus_dict_no_vocab.pop("vocab", None) 62 | pytorch_dataset_dump_path = pytorch_dump_folder_path + "/" + CORPUS_NAME 63 | print("Save dataset to {}".format(pytorch_dataset_dump_path)) 64 | torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path) 65 | 66 | if tf_checkpoint_path: 67 | # Convert a pre-trained TensorFlow model 68 | config_path = os.path.abspath(transfo_xl_config_file) 69 | tf_path = os.path.abspath(tf_checkpoint_path) 70 | 71 | print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path)) 72 | # Initialise PyTorch model 73 | if transfo_xl_config_file == "": 74 | config = TransfoXLConfig() 75 | else: 76 | config = TransfoXLConfig.from_json_file(transfo_xl_config_file) 77 | print("Building PyTorch model from configuration: {}".format(str(config))) 78 | model = TransfoXLLMHeadModel(config) 79 | 80 | model = load_tf_weights_in_transfo_xl(model, config, tf_path) 81 | # Save pytorch-model 82 | pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) 83 | pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) 84 | print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) 85 | torch.save(model.state_dict(), pytorch_weights_dump_path) 86 | print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) 87 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 88 | f.write(config.to_json_string()) 89 | 90 | 91 | if __name__ == "__main__": 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument( 94 | "--pytorch_dump_folder_path", 95 | default=None, 96 | type=str, 97 | required=True, 98 | help="Path to the folder to store the PyTorch model or dataset/vocab.", 99 | ) 100 | parser.add_argument( 101 | "--tf_checkpoint_path", 102 | default="", 103 | type=str, 104 | help="An optional path to a TensorFlow checkpoint path to be converted.", 105 | ) 106 | parser.add_argument( 107 | "--transfo_xl_config_file", 108 | default="", 109 | type=str, 110 | help="An optional config json file corresponding to the pre-trained BERT model. \n" 111 | "This specifies the model architecture.", 112 | ) 113 | parser.add_argument( 114 | "--transfo_xl_dataset_file", 115 | default="", 116 | type=str, 117 | help="An optional dataset file to be converted in a vocabulary.", 118 | ) 119 | args = parser.parse_args() 120 | convert_transfo_xl_checkpoint_to_pytorch( 121 | args.tf_checkpoint_path, 122 | args.transfo_xl_config_file, 123 | args.pytorch_dump_folder_path, 124 | args.transfo_xl_dataset_file, 125 | ) 126 | -------------------------------------------------------------------------------- /src/transformers/data/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | try: 18 | from scipy.stats import pearsonr, spearmanr 19 | from sklearn.metrics import matthews_corrcoef, f1_score, average_precision_score, ndcg_score, roc_auc_score 20 | import numpy as np 21 | _has_sklearn = True 22 | except (AttributeError, ImportError): 23 | _has_sklearn = False 24 | 25 | 26 | def is_sklearn_available(): 27 | return _has_sklearn 28 | 29 | 30 | if _has_sklearn: 31 | 32 | def simple_accuracy(preds, labels): 33 | return (preds == labels).mean() 34 | 35 | def simple_ndcg(preds, labels, guids): 36 | ndcgs = [] 37 | query2content = {} 38 | for guid, pred, label in zip(guids, preds, labels): 39 | query = guid.split("_")[0] 40 | if not query in query2content: 41 | query2content[query] = [[int(pred)], [int(label)]] 42 | else: 43 | query2content[query][0].append(int(pred)) 44 | query2content[query][1].append(int(label)) 45 | 46 | for key in query2content.keys(): 47 | if len(query2content[key][1]) < 2 or len(query2content[key][0]) < 2: 48 | continue 49 | ndcgs.append(ndcg_score(np.asarray([query2content[key][1]]), np.asarray([query2content[key][0]]))) 50 | return {"ndcg" : np.array(ndcgs).mean()} 51 | 52 | def acc_and_f1(preds, labels): 53 | acc = simple_accuracy(preds, labels) 54 | f1 = f1_score(y_true=labels, y_pred=preds) 55 | return { 56 | "acc": acc, 57 | "f1": f1, 58 | "acc_and_f1": (acc + f1) / 2, 59 | } 60 | 61 | def acc_and_auc(preds, labels): # auc of pr curve is equal to average precision 62 | acc = simple_accuracy(preds, labels) 63 | auc = average_precision_score(labels, preds) 64 | return { 65 | "acc": acc, 66 | "auc": auc, 67 | "acc_and_auc": (acc + auc) / 2, 68 | } 69 | 70 | def acc_and_roc_auc(preds, labels): # auc of pr curve is equal to average precision 71 | acc = simple_accuracy(preds, labels) 72 | roc_auc = roc_auc_score(labels, preds) 73 | return { 74 | "acc": acc, 75 | "roc_auc": roc_auc, 76 | "acc_and_roc_auc": (acc + roc_auc) / 2, 77 | } 78 | 79 | def pearson_and_spearman(preds, labels): 80 | pearson_corr = pearsonr(preds, labels)[0] 81 | spearman_corr = spearmanr(preds, labels)[0] 82 | return { 83 | "pearson": pearson_corr, 84 | "spearmanr": spearman_corr, 85 | "corr": (pearson_corr + spearman_corr) / 2, 86 | } 87 | 88 | def xglue_compute_metrics(task_name, preds, labels, guids): 89 | assert len(preds) == len(labels) 90 | if task_name == "xnli": 91 | return {"acc": simple_accuracy(preds, labels)} 92 | elif task_name == "pawsx": 93 | return acc_and_auc(preds, labels) 94 | elif task_name == "qam": 95 | return acc_and_auc(preds, labels) 96 | elif task_name == "ads": 97 | return acc_and_roc_auc(preds, labels) 98 | elif task_name == "rel": 99 | return simple_ndcg(preds, labels, guids) 100 | elif task_name == "news": 101 | return {"acc": simple_accuracy(preds, labels)} 102 | else: 103 | raise KeyError(task_name) 104 | 105 | 106 | def glue_compute_metrics(task_name, preds, labels): 107 | assert len(preds) == len(labels) 108 | if task_name == "cola": 109 | return {"mcc": matthews_corrcoef(labels, preds)} 110 | elif task_name == "sst-2": 111 | return {"acc": simple_accuracy(preds, labels)} 112 | elif task_name == "mrpc": 113 | return acc_and_f1(preds, labels) 114 | elif task_name == "sts-b": 115 | return pearson_and_spearman(preds, labels) 116 | elif task_name == "qqp": 117 | return acc_and_f1(preds, labels) 118 | elif task_name == "mnli": 119 | return {"acc": simple_accuracy(preds, labels)} 120 | elif task_name == "mnli-mm": 121 | return {"acc": simple_accuracy(preds, labels)} 122 | elif task_name == "qnli": 123 | return {"acc": simple_accuracy(preds, labels)} 124 | elif task_name == "rte": 125 | return {"acc": simple_accuracy(preds, labels)} 126 | elif task_name == "wnli": 127 | return {"acc": simple_accuracy(preds, labels)} 128 | elif task_name == "hans": 129 | return {"acc": simple_accuracy(preds, labels)} 130 | else: 131 | raise KeyError(task_name) 132 | 133 | def xnli_compute_metrics(task_name, preds, labels): 134 | assert len(preds) == len(labels) 135 | if task_name == "xnli": 136 | return {"acc": simple_accuracy(preds, labels)} 137 | else: 138 | raise KeyError(task_name) 139 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple check list from AllenNLP repo: https://github.com/allenai/allennlp/blob/master/setup.py 3 | 4 | To create the package for pypi. 5 | 6 | 1. Change the version in __init__.py, setup.py as well as docs/source/conf.py. 7 | 8 | 2. Commit these changes with the message: "Release: VERSION" 9 | 10 | 3. Add a tag in git to mark the release: "git tag VERSION -m'Adds tag VERSION for pypi' " 11 | Push the tag to git: git push --tags origin master 12 | 13 | 4. Build both the sources and the wheel. Do not change anything in setup.py between 14 | creating the wheel and the source distribution (obviously). 15 | 16 | For the wheel, run: "python setup.py bdist_wheel" in the top level directory. 17 | (this will build a wheel for the python version you use to build it). 18 | 19 | For the sources, run: "python setup.py sdist" 20 | You should now have a /dist directory with both .whl and .tar.gz source versions. 21 | 22 | 5. Check that everything looks correct by uploading the package to the pypi test server: 23 | 24 | twine upload dist/* -r pypitest 25 | (pypi suggest using twine as other methods upload files via plaintext.) 26 | You may have to specify the repository url, use the following command then: 27 | twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/ 28 | 29 | Check that you can install it in a virtualenv by running: 30 | pip install -i https://testpypi.python.org/pypi transformers 31 | 32 | 6. Upload the final version to actual pypi: 33 | twine upload dist/* -r pypi 34 | 35 | 7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory. 36 | 37 | 8. Update the documentation commit in .circleci/deploy.sh for the accurate documentation to be displayed 38 | 39 | 9. Update README.md to redirect to correct documentation. 40 | """ 41 | 42 | import shutil 43 | from pathlib import Path 44 | 45 | from setuptools import find_packages, setup 46 | 47 | 48 | # Remove stale transformers.egg-info directory to avoid https://github.com/pypa/pip/issues/5466 49 | stale_egg_info = Path(__file__).parent / "transformers.egg-info" 50 | if stale_egg_info.exists(): 51 | print( 52 | ( 53 | "Warning: {} exists.\n\n" 54 | "If you recently updated transformers to 3.0 or later, this is expected,\n" 55 | "but it may prevent transformers from installing in editable mode.\n\n" 56 | "This directory is automatically generated by Python's packaging tools.\n" 57 | "I will remove it now.\n\n" 58 | "See https://github.com/pypa/pip/issues/5466 for details.\n" 59 | ).format(stale_egg_info) 60 | ) 61 | shutil.rmtree(stale_egg_info) 62 | 63 | 64 | extras = {} 65 | 66 | extras["mecab"] = ["mecab-python3"] 67 | extras["sklearn"] = ["scikit-learn==0.22.1"] 68 | extras["tf"] = ["tensorflow"] 69 | extras["tf-cpu"] = ["tensorflow-cpu"] 70 | extras["torch"] = ["torch"] 71 | 72 | extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"] 73 | extras["all"] = extras["serving"] + ["tensorflow", "torch"] 74 | 75 | extras["testing"] = ["pytest", "pytest-xdist"] 76 | extras["quality"] = ["black", "isort", "flake8"] 77 | extras["docs"] = ["recommonmark", "sphinx", "sphinx-markdown-tables", "sphinx-rtd-theme"] 78 | extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3", "scikit-learn", "tensorflow", "torch"] 79 | 80 | setup( 81 | name="transformers", 82 | version="2.5.1", 83 | author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors", 84 | author_email="thomas@huggingface.co", 85 | description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch", 86 | long_description=open("README.md", "r", encoding="utf-8").read(), 87 | long_description_content_type="text/markdown", 88 | keywords="NLP deep learning transformer pytorch tensorflow BERT GPT GPT-2 google openai CMU", 89 | license="Apache", 90 | url="https://github.com/huggingface/transformers", 91 | package_dir={"": "src"}, 92 | packages=find_packages("src"), 93 | install_requires=[ 94 | "numpy", 95 | "tokenizers == 0.5.2", 96 | # accessing files from S3 directly 97 | "boto3", 98 | # filesystem locks e.g. to prevent parallel downloads 99 | "filelock", 100 | # for downloading models over HTTPS 101 | "requests", 102 | # progress bars in model download and training scripts 103 | "tqdm >= 4.27", 104 | # for OpenAI GPT 105 | "regex != 2019.12.17", 106 | # for XLNet 107 | "sentencepiece == 0.1.92", 108 | # for XLM 109 | "sacremoses", 110 | # for ndcg 111 | "scikit-learn == 0.22", 112 | # for tensorboard 113 | "tensorboardX", 114 | # for ner 115 | "seqeval", 116 | # for torch 117 | "torch", 118 | ], 119 | extras_require=extras, 120 | scripts=["transformers-cli"], 121 | python_requires=">=3.5.0", 122 | classifiers=[ 123 | "Development Status :: 5 - Production/Stable", 124 | "Intended Audience :: Developers", 125 | "Intended Audience :: Education", 126 | "Intended Audience :: Science/Research", 127 | "License :: OSI Approved :: Apache Software License", 128 | "Operating System :: OS Independent", 129 | "Programming Language :: Python :: 3", 130 | "Programming Language :: Python :: 3.5", 131 | "Programming Language :: Python :: 3.6", 132 | "Programming Language :: Python :: 3.7", 133 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 134 | ], 135 | ) 136 | -------------------------------------------------------------------------------- /eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source init.sh 4 | 5 | MODEL_NAME_OR_PATH='xlm-roberta-large' 6 | 7 | usage() 8 | { 9 | cat << EOF 10 | usage: $0 options 11 | OPTIONS: 12 | -h Show the help and exit 13 | -n Experiment name to evaluate 14 | -m Pretrained model name or path 15 | -t task to evaluate 16 | -x For convinent usage 17 | EOF 18 | } 19 | 20 | while getopts "h:d:m:n:t:x:k:" opt 21 | do 22 | case $opt in 23 | h) 24 | usage 25 | exit 1 26 | ;; 27 | n) 28 | EXP_NAME=$OPTARG 29 | ;; 30 | m) 31 | MODEL_NAME_OR_PATH=$OPTARG 32 | ;; 33 | t) 34 | TASK=$OPTARG 35 | ;; 36 | x) 37 | OTHER_ARGS=$OPTARG 38 | ;; 39 | esac 40 | done 41 | 42 | DATA_DIR=$DATA_ROOT/data_raw 43 | if [[ ! -d $DATA_DIR ]]; then 44 | echo "$DATA_DIR not exist" 45 | exit 1 46 | fi 47 | 48 | OUTPUT_DIR=$DATA_ROOT/outputs/$EXP_NAME 49 | if [[ ! -d $OUTPUT_DIR ]]; then 50 | echo "$OUTPUT_DIR not exist, please specify it" 51 | exit 1 52 | fi 53 | 54 | xnli() { 55 | python ./examples/run_xcls.py \ 56 | --task_name xnli \ 57 | --model_type filter \ 58 | --data_dir $DATA_DIR/xnli \ 59 | --model_name_or_path $MODEL_NAME_OR_PATH \ 60 | --language ar,bg,de,el,en,es,fr,hi,ru,sw,th,tr,ur,vi,zh \ 61 | --train_language en \ 62 | --do_eval \ 63 | --eval_splits 'valid' \ 64 | --max_seq_length 256 \ 65 | --output_dir $OUTPUT_DIR \ 66 | --per_gpu_eval_batch_size 64 \ 67 | --filter_m 1 --filter_k 1 \ 68 | ${OTHER_ARGS} 69 | } 70 | 71 | pawsx() { 72 | python ./examples/run_xcls.py \ 73 | --task_name pawsx \ 74 | --data_dir $DATA_DIR/pawsx \ 75 | --model_type filter \ 76 | --model_name_or_path $MODEL_NAME_OR_PATH \ 77 | --language de,en,es,fr,ja,ko,zh \ 78 | --train_language en \ 79 | --do_eval \ 80 | --eval_splits valid \ 81 | --max_seq_length 256 \ 82 | --output_dir $OUTPUT_DIR \ 83 | --per_gpu_eval_batch_size 64 \ 84 | --filter_m 1 --filter_k 1 \ 85 | ${OTHER_ARGS} 86 | } 87 | 88 | # mlqa and xquad share the same training set 89 | mlqa() { 90 | python ./examples/run_xqa.py \ 91 | --task_name mlqa \ 92 | --data_dir $DATA_DIR \ 93 | --model_type filter \ 94 | --model_name_or_path $MODEL_NAME_OR_PATH \ 95 | --language en,es,de,ar,hi,vi,zh \ 96 | --train_language en \ 97 | --do_eval \ 98 | --eval_splits dev \ 99 | --do_lower_case \ 100 | --per_gpu_eval_batch_size 64 \ 101 | --max_seq_length 384 \ 102 | --doc_stride 128 \ 103 | --output_dir $OUTPUT_DIR \ 104 | --threads 8 \ 105 | --filter_m 1 --filter_k 20 \ 106 | ${OTHER_ARGS} 107 | } 108 | 109 | xquad() { 110 | python ./examples/run_xqa.py \ 111 | --task_name xquad \ 112 | --model_type filter \ 113 | --model_name_or_path $MODEL_NAME_OR_PATH \ 114 | --do_eval \ 115 | --eval_splits 'test' \ 116 | --do_lower_case \ 117 | --language ar,de,el,en,es,hi,ru,th,tr,vi,zh \ 118 | --train_language en \ 119 | --data_dir $DATA_DIR \ 120 | --per_gpu_eval_batch_size 64 \ 121 | --max_seq_length 384 \ 122 | --doc_stride 128 \ 123 | --output_dir $OUTPUT_DIR \ 124 | --threads 8 \ 125 | --filter_m 1 --filter_k 20 \ 126 | ${OTHER_ARGS} 127 | } 128 | 129 | tydiqa() { 130 | python ./examples/run_xqa.py \ 131 | --task_name tydiqa \ 132 | --model_type filter \ 133 | --model_name_or_path $MODEL_NAME_OR_PATH \ 134 | --do_eval \ 135 | --do_lower_case \ 136 | --language ar,bn,en,fi,id,ko,ru,sw,te \ 137 | --eval_splits dev \ 138 | --train_language en \ 139 | --data_dir $DATA_DIR \ 140 | --per_gpu_eval_batch_size 64 \ 141 | --max_seq_length 384 \ 142 | --doc_stride 128 \ 143 | --output_dir $OUTPUT_DIR \ 144 | --threads 8 \ 145 | --filter_m 1 --filter_k 20 \ 146 | ${OTHER_ARGS} 147 | } 148 | 149 | udpos() { 150 | python ./examples/run_xtag.py \ 151 | --task_name udpos \ 152 | --model_type filter \ 153 | --data_dir $DATA_DIR/udpos/udpos_processed_maxlen128 \ 154 | --labels $DATA_DIR/udpos/udpos_processed_maxlen128/labels.txt \ 155 | --model_name_or_path $MODEL_NAME_OR_PATH \ 156 | --output_dir $OUTPUT_DIR \ 157 | --train_language en \ 158 | --language 'af,ar,bg,de,el,en,es,et,eu,fa,fi,fr,he,hi,hu,id,it,ja,kk,ko,mr,nl,pt,ru,ta,te,th,tl,tr,ur,vi,yo,zh' \ 159 | --eval_splits dev \ 160 | --max_seq_length 128 \ 161 | --per_gpu_eval_batch_size 64 \ 162 | --do_eval \ 163 | --filter_m 1 --filter_k 1 \ 164 | ${OTHER_ARGS} 165 | } 166 | 167 | panx() { 168 | python ./examples/run_xtag.py \ 169 | --task_name panx \ 170 | --model_type filter \ 171 | --data_dir $DATA_DIR/panx/panx_processed_maxlen128 \ 172 | --labels $DATA_DIR/panx/panx_processed_maxlen128/labels.txt \ 173 | --model_name_or_path $MODEL_NAME_OR_PATH \ 174 | --output_dir $OUTPUT_DIR \ 175 | --language ar,he,vi,id,jv,ms,tl,eu,ml,ta,te,af,nl,en,de,el,bn,hi,mr,ur,fa,fr,it,pt,es,bg,ru,ja,ka,ko,th,sw,yo,my,zh,kk,tr,et,fi,hu \ 176 | --eval_splits dev \ 177 | --train_language en \ 178 | --max_seq_length 128 \ 179 | --per_gpu_eval_batch_size 64 \ 180 | --do_eval \ 181 | --filter_m 1 --filter_k 1 \ 182 | ${OTHER_ARGS} 183 | } 184 | 185 | for task in xnli pawsx mlqa xquad tydiqa panx udpos; do 186 | if [[ ${TASK:-"xnli"} == $task ]]; then 187 | $task 188 | fi 189 | done 190 | -------------------------------------------------------------------------------- /src/transformers/tokenization_flaubert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for Flaubert, based on XLM.""" 16 | 17 | 18 | import logging 19 | import unicodedata 20 | 21 | import six 22 | 23 | from .tokenization_xlm import XLMTokenizer 24 | 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | VOCAB_FILES_NAMES = { 29 | "vocab_file": "vocab.json", 30 | "merges_file": "merges.txt", 31 | } 32 | 33 | PRETRAINED_VOCAB_FILES_MAP = { 34 | "vocab_file": { 35 | "flaubert-small-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/vocab.json", 36 | "flaubert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/vocab.json", 37 | "flaubert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/vocab.json", 38 | "flaubert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/vocab.json", 39 | }, 40 | "merges_file": { 41 | "flaubert-small-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/merges.txt", 42 | "flaubert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/merges.txt", 43 | "flaubert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/merges.txt", 44 | "flaubert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/merges.txt", 45 | }, 46 | } 47 | 48 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 49 | "flaubert-small-cased": 512, 50 | "flaubert-base-uncased": 512, 51 | "flaubert-base-cased": 512, 52 | "flaubert-large-cased": 512, 53 | } 54 | 55 | PRETRAINED_INIT_CONFIGURATION = { 56 | "flaubert-small-cased": {"do_lowercase": False}, 57 | "flaubert-base-uncased": {"do_lowercase": True}, 58 | "flaubert-base-cased": {"do_lowercase": False}, 59 | "flaubert-large-cased": {"do_lowercase": False}, 60 | } 61 | 62 | 63 | def convert_to_unicode(text): 64 | """ 65 | Converts `text` to Unicode (if it's not already), assuming UTF-8 input. 66 | """ 67 | # six_ensure_text is copied from https://github.com/benjaminp/six 68 | def six_ensure_text(s, encoding="utf-8", errors="strict"): 69 | if isinstance(s, six.binary_type): 70 | return s.decode(encoding, errors) 71 | elif isinstance(s, six.text_type): 72 | return s 73 | else: 74 | raise TypeError("not expecting type '%s'" % type(s)) 75 | 76 | return six_ensure_text(text, encoding="utf-8", errors="ignore") 77 | 78 | 79 | class FlaubertTokenizer(XLMTokenizer): 80 | """ 81 | BPE tokenizer for Flaubert 82 | 83 | - Moses preprocessing & tokenization 84 | - Normalize all inputs text 85 | - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \ 86 | (ex: "__classify__") to a vocabulary 87 | - `do_lowercase` controle lower casing (automatically set for pretrained vocabularies) 88 | 89 | This tokenizer inherits from :class:`~transformers.XLMTokenizer`. Please check the superclass for usage examples 90 | and documentation regarding arguments. 91 | """ 92 | 93 | vocab_files_names = VOCAB_FILES_NAMES 94 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 95 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 96 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 97 | 98 | def __init__(self, do_lowercase=False, **kwargs): 99 | super().__init__(**kwargs) 100 | self.do_lowercase = do_lowercase 101 | self.do_lowercase_and_remove_accent = False 102 | 103 | def preprocess_text(self, text): 104 | text = text.replace("``", '"').replace("''", '"') 105 | text = convert_to_unicode(text) 106 | text = unicodedata.normalize("NFC", text) 107 | 108 | if self.do_lowercase: 109 | text = text.lower() 110 | 111 | return text 112 | 113 | def _tokenize(self, text, bypass_tokenizer=False): 114 | """ 115 | Tokenize a string given language code using Moses. 116 | 117 | Details of tokenization: 118 | - [sacremoses](https://github.com/alvations/sacremoses): port of Moses 119 | - Install with `pip install sacremoses` 120 | 121 | Args: 122 | - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) (bool). If True, we only apply BPE. 123 | 124 | Returns: 125 | List of tokens. 126 | """ 127 | lang = "fr" 128 | if lang and self.lang2id and lang not in self.lang2id: 129 | logger.error( 130 | "Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model." 131 | ) 132 | 133 | if bypass_tokenizer: 134 | text = text.split() 135 | else: 136 | text = self.preprocess_text(text) 137 | text = self.moses_pipeline(text, lang=lang) 138 | text = self.moses_tokenize(text, lang=lang) 139 | 140 | split_tokens = [] 141 | for token in text: 142 | if token: 143 | split_tokens.extend([t for t in self.bpe(token).split(" ")]) 144 | 145 | return split_tokens 146 | -------------------------------------------------------------------------------- /src/transformers/modeling_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 Inria, Facebook AI Research and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """PyTorch CamemBERT model. """ 17 | 18 | import logging 19 | 20 | from .configuration_camembert import CamembertConfig 21 | from .file_utils import add_start_docstrings 22 | from .modeling_roberta import ( 23 | RobertaForMaskedLM, 24 | RobertaForMultipleChoice, 25 | RobertaForQuestionAnswering, 26 | RobertaForSequenceClassification, 27 | RobertaForTokenClassification, 28 | RobertaModel, 29 | ) 30 | 31 | 32 | logger = logging.getLogger(__name__) 33 | 34 | CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 35 | "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-pytorch_model.bin", 36 | "umberto-commoncrawl-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-commoncrawl-cased-v1/pytorch_model.bin", 37 | "umberto-wikipedia-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-wikipedia-uncased-v1/pytorch_model.bin", 38 | } 39 | 40 | CAMEMBERT_START_DOCSTRING = r""" 41 | 42 | This model is a PyTorch `torch.nn.Module `_ sub-class. 43 | Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general 44 | usage and behavior. 45 | 46 | Parameters: 47 | config (:class:`~transformers.CamembertConfig`): Model configuration class with all the parameters of the 48 | model. Initializing with a config file does not load the weights associated with the model, only the 49 | configuration. 50 | Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. 51 | """ 52 | 53 | 54 | @add_start_docstrings( 55 | "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.", 56 | CAMEMBERT_START_DOCSTRING, 57 | ) 58 | class CamembertModel(RobertaModel): 59 | """ 60 | This class overrides :class:`~transformers.RobertaModel`. Please check the 61 | superclass for the appropriate documentation alongside usage examples. 62 | """ 63 | 64 | config_class = CamembertConfig 65 | pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 66 | 67 | 68 | @add_start_docstrings( 69 | """CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING, 70 | ) 71 | class CamembertForMaskedLM(RobertaForMaskedLM): 72 | """ 73 | This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the 74 | superclass for the appropriate documentation alongside usage examples. 75 | """ 76 | 77 | config_class = CamembertConfig 78 | pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 79 | 80 | 81 | @add_start_docstrings( 82 | """CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer 83 | on top of the pooled output) e.g. for GLUE tasks. """, 84 | CAMEMBERT_START_DOCSTRING, 85 | ) 86 | class CamembertForSequenceClassification(RobertaForSequenceClassification): 87 | """ 88 | This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the 89 | superclass for the appropriate documentation alongside usage examples. 90 | """ 91 | 92 | config_class = CamembertConfig 93 | pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 94 | 95 | 96 | @add_start_docstrings( 97 | """CamemBERT Model with a multiple choice classification head on top (a linear layer on top of 98 | the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, 99 | CAMEMBERT_START_DOCSTRING, 100 | ) 101 | class CamembertForMultipleChoice(RobertaForMultipleChoice): 102 | """ 103 | This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the 104 | superclass for the appropriate documentation alongside usage examples. 105 | """ 106 | 107 | config_class = CamembertConfig 108 | pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 109 | 110 | 111 | @add_start_docstrings( 112 | """CamemBERT Model with a token classification head on top (a linear layer on top of 113 | the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, 114 | CAMEMBERT_START_DOCSTRING, 115 | ) 116 | class CamembertForTokenClassification(RobertaForTokenClassification): 117 | """ 118 | This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the 119 | superclass for the appropriate documentation alongside usage examples. 120 | """ 121 | 122 | config_class = CamembertConfig 123 | pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 124 | 125 | 126 | @add_start_docstrings( 127 | """CamemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD 128 | (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits` """, 129 | CAMEMBERT_START_DOCSTRING, 130 | ) 131 | class CamembertForQuestionAnswering(RobertaForQuestionAnswering): 132 | """ 133 | This class overrides :class:`~transformers.RobertaForQuestionAnswering`. Please check the 134 | superclass for the appropriate documentation alongside usage examples. 135 | """ 136 | 137 | config_class = CamembertConfig 138 | pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 139 | -------------------------------------------------------------------------------- /src/transformers/configuration_ctrl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Salesforce and HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Salesforce CTRL configuration """ 16 | 17 | 18 | import logging 19 | 20 | from .configuration_utils import PretrainedConfig 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf-ctrl/pytorch/ctrl-config.json"} 26 | 27 | 28 | class CTRLConfig(PretrainedConfig): 29 | """ 30 | This is the configuration class to store the configuration of an :class:`~transformers.CTRLModel`. 31 | It is used to instantiate an CTRL model according to the specified arguments, defining the model 32 | architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of 33 | the `ctrl `__ architecture from SalesForce. 34 | 35 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used 36 | to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` 37 | for more information. 38 | 39 | Args: 40 | vocab_size (:obj:`int`, optional, defaults to 246534): 41 | Vocabulary size of the CTRL model. Defines the different tokens that 42 | can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`. 43 | n_positions (:obj:`int`, optional, defaults to 256): 44 | The maximum sequence length that this model might ever be used with. 45 | Typically set this to something large just in case (e.g., 512 or 1024 or 2048). 46 | n_ctx (:obj:`int`, optional, defaults to 256): 47 | Dimensionality of the causal mask (usually same as n_positions). 48 | n_embd (:obj:`int`, optional, defaults to 1280): 49 | Dimensionality of the embeddings and hidden states. 50 | dff (:obj:`int`, optional, defaults to 8192): 51 | Dimensionality of the inner dimension of the FFN. 52 | n_layer (:obj:`int`, optional, defaults to 48): 53 | Number of hidden layers in the Transformer encoder. 54 | n_head (:obj:`int`, optional, defaults to 16): 55 | Number of attention heads for each attention layer in the Transformer encoder. 56 | resid_pdrop (:obj:`float`, optional, defaults to 0.1): 57 | The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. 58 | embd_pdrop (:obj:`int`, optional, defaults to 0.1): 59 | The dropout ratio for the embeddings. 60 | attn_pdrop (:obj:`float`, optional, defaults to 0.1): 61 | The dropout ratio for the attention. 62 | layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6): 63 | The epsilon to use in the layer normalization layers 64 | initializer_range (:obj:`float`, optional, defaults to 0.02): 65 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 66 | 67 | Example:: 68 | 69 | from transformers import CTRLModel, CTRLConfig 70 | 71 | # Initializing a CTRL configuration 72 | configuration = CTRLConfig() 73 | 74 | # Initializing a model from the configuration 75 | model = CTRLModel(configuration) 76 | 77 | # Accessing the model configuration 78 | configuration = model.config 79 | 80 | Attributes: 81 | pretrained_config_archive_map (Dict[str, str]): 82 | A dictionary containing all the available pre-trained checkpoints. 83 | """ 84 | 85 | pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP 86 | model_type = "ctrl" 87 | 88 | def __init__( 89 | self, 90 | vocab_size=246534, 91 | n_positions=256, 92 | n_ctx=256, 93 | n_embd=1280, 94 | dff=8192, 95 | n_layer=48, 96 | n_head=16, 97 | resid_pdrop=0.1, 98 | embd_pdrop=0.1, 99 | attn_pdrop=0.1, 100 | layer_norm_epsilon=1e-6, 101 | initializer_range=0.02, 102 | summary_type="cls_index", 103 | summary_use_proj=True, 104 | summary_activation=None, 105 | summary_proj_to_labels=True, 106 | summary_first_dropout=0.1, 107 | **kwargs 108 | ): 109 | super().__init__(**kwargs) 110 | self.vocab_size = vocab_size 111 | self.n_ctx = n_ctx 112 | self.n_positions = n_positions 113 | self.n_embd = n_embd 114 | self.n_layer = n_layer 115 | self.n_head = n_head 116 | self.dff = dff 117 | self.resid_pdrop = resid_pdrop 118 | self.embd_pdrop = embd_pdrop 119 | self.attn_pdrop = attn_pdrop 120 | self.layer_norm_epsilon = layer_norm_epsilon 121 | self.initializer_range = initializer_range 122 | 123 | self.summary_type = summary_type 124 | self.summary_use_proj = summary_use_proj 125 | self.summary_activation = summary_activation 126 | self.summary_first_dropout = summary_first_dropout 127 | self.summary_proj_to_labels = summary_proj_to_labels 128 | 129 | @property 130 | def max_position_embeddings(self): 131 | return self.n_positions 132 | 133 | @property 134 | def hidden_size(self): 135 | return self.n_embd 136 | 137 | @property 138 | def num_attention_heads(self): 139 | return self.n_head 140 | 141 | @property 142 | def num_hidden_layers(self): 143 | return self.n_layer 144 | -------------------------------------------------------------------------------- /src/transformers/commands/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | from argparse import ArgumentParser, Namespace 3 | from logging import getLogger 4 | 5 | from transformers import SingleSentenceClassificationProcessor as Processor 6 | from transformers import TextClassificationPipeline, is_tf_available, is_torch_available 7 | from transformers.commands import BaseTransformersCLICommand 8 | 9 | 10 | if not is_tf_available() and not is_torch_available(): 11 | raise RuntimeError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training") 12 | 13 | # TF training parameters 14 | USE_XLA = False 15 | USE_AMP = False 16 | 17 | 18 | def train_command_factory(args: Namespace): 19 | """ 20 | Factory function used to instantiate serving server from provided command line arguments. 21 | :return: ServeCommand 22 | """ 23 | return TrainCommand(args) 24 | 25 | 26 | class TrainCommand(BaseTransformersCLICommand): 27 | @staticmethod 28 | def register_subcommand(parser: ArgumentParser): 29 | """ 30 | Register this command to argparse so it's available for the transformer-cli 31 | :param parser: Root parser to register command-specific arguments 32 | :return: 33 | """ 34 | train_parser = parser.add_parser("train", help="CLI tool to train a model on a task.") 35 | 36 | train_parser.add_argument( 37 | "--train_data", 38 | type=str, 39 | required=True, 40 | help="path to train (and optionally evaluation) dataset as a csv with " 41 | "tab separated labels and sentences.", 42 | ) 43 | train_parser.add_argument( 44 | "--column_label", type=int, default=0, help="Column of the dataset csv file with example labels." 45 | ) 46 | train_parser.add_argument( 47 | "--column_text", type=int, default=1, help="Column of the dataset csv file with example texts." 48 | ) 49 | train_parser.add_argument( 50 | "--column_id", type=int, default=2, help="Column of the dataset csv file with example ids." 51 | ) 52 | train_parser.add_argument( 53 | "--skip_first_row", action="store_true", help="Skip the first row of the csv file (headers)." 54 | ) 55 | 56 | train_parser.add_argument("--validation_data", type=str, default="", help="path to validation dataset.") 57 | train_parser.add_argument( 58 | "--validation_split", 59 | type=float, 60 | default=0.1, 61 | help="if validation dataset is not provided, fraction of train dataset " "to use as validation dataset.", 62 | ) 63 | 64 | train_parser.add_argument("--output", type=str, default="./", help="path to saved the trained model.") 65 | 66 | train_parser.add_argument( 67 | "--task", type=str, default="text_classification", help="Task to train the model on." 68 | ) 69 | train_parser.add_argument( 70 | "--model", type=str, default="bert-base-uncased", help="Model's name or path to stored model." 71 | ) 72 | train_parser.add_argument("--train_batch_size", type=int, default=32, help="Batch size for training.") 73 | train_parser.add_argument("--valid_batch_size", type=int, default=64, help="Batch size for validation.") 74 | train_parser.add_argument("--learning_rate", type=float, default=3e-5, help="Learning rate.") 75 | train_parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon for Adam optimizer.") 76 | train_parser.set_defaults(func=train_command_factory) 77 | 78 | def __init__(self, args: Namespace): 79 | self.logger = getLogger("transformers-cli/training") 80 | 81 | self.framework = "tf" if is_tf_available() else "torch" 82 | 83 | os.makedirs(args.output, exist_ok=True) 84 | assert os.path.isdir(args.output) 85 | self.output = args.output 86 | 87 | self.column_label = args.column_label 88 | self.column_text = args.column_text 89 | self.column_id = args.column_id 90 | 91 | self.logger.info("Loading {} pipeline for {}".format(args.task, args.model)) 92 | if args.task == "text_classification": 93 | self.pipeline = TextClassificationPipeline.from_pretrained(args.model) 94 | elif args.task == "token_classification": 95 | raise NotImplementedError 96 | elif args.task == "question_answering": 97 | raise NotImplementedError 98 | 99 | self.logger.info("Loading dataset from {}".format(args.train_data)) 100 | self.train_dataset = Processor.create_from_csv( 101 | args.train_data, 102 | column_label=args.column_label, 103 | column_text=args.column_text, 104 | column_id=args.column_id, 105 | skip_first_row=args.skip_first_row, 106 | ) 107 | self.valid_dataset = None 108 | if args.validation_data: 109 | self.logger.info("Loading validation dataset from {}".format(args.validation_data)) 110 | self.valid_dataset = Processor.create_from_csv( 111 | args.validation_data, 112 | column_label=args.column_label, 113 | column_text=args.column_text, 114 | column_id=args.column_id, 115 | skip_first_row=args.skip_first_row, 116 | ) 117 | 118 | self.validation_split = args.validation_split 119 | self.train_batch_size = args.train_batch_size 120 | self.valid_batch_size = args.valid_batch_size 121 | self.learning_rate = args.learning_rate 122 | self.adam_epsilon = args.adam_epsilon 123 | 124 | def run(self): 125 | if self.framework == "tf": 126 | return self.run_tf() 127 | return self.run_torch() 128 | 129 | def run_torch(self): 130 | raise NotImplementedError 131 | 132 | def run_tf(self): 133 | self.pipeline.fit( 134 | self.train_dataset, 135 | validation_data=self.valid_dataset, 136 | validation_split=self.validation_split, 137 | learning_rate=self.learning_rate, 138 | adam_epsilon=self.adam_epsilon, 139 | train_batch_size=self.train_batch_size, 140 | valid_batch_size=self.valid_batch_size, 141 | ) 142 | 143 | # Save trained pipeline 144 | self.pipeline.save_pretrained(self.output) 145 | -------------------------------------------------------------------------------- /src/transformers/data/metrics/mlqa_evaluation_v1.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | """ Official evaluation script for the MLQA dataset. """ 8 | from __future__ import print_function 9 | from collections import Counter 10 | import string 11 | import re 12 | import argparse 13 | import json 14 | import sys 15 | import unicodedata 16 | 17 | 18 | PUNCT = {chr(i) for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P')}.union(string.punctuation) 19 | WHITESPACE_LANGS = ['en', 'es', 'hi', 'vi', 'de', 'ar'] 20 | MIXED_SEGMENTATION_LANGS = ['zh'] 21 | 22 | 23 | def whitespace_tokenize(text): 24 | return text.split() 25 | 26 | 27 | def mixed_segmentation(text): 28 | segs_out = [] 29 | temp_str = "" 30 | for char in text: 31 | if re.search(r'[\u4e00-\u9fa5]', char) or char in PUNCT: 32 | if temp_str != "": 33 | ss = whitespace_tokenize(temp_str) 34 | segs_out.extend(ss) 35 | temp_str = "" 36 | segs_out.append(char) 37 | else: 38 | temp_str += char 39 | 40 | if temp_str != "": 41 | ss = whitespace_tokenize(temp_str) 42 | segs_out.extend(ss) 43 | 44 | return segs_out 45 | 46 | 47 | def normalize_answer(s, lang): 48 | """Lower text and remove punctuation, articles and extra whitespace.""" 49 | 50 | def remove_articles(text, lang): 51 | if lang == 'en': 52 | return re.sub(r'\b(a|an|the)\b', ' ', text) 53 | elif lang == 'es': 54 | return re.sub(r'\b(un|una|unos|unas|el|la|los|las)\b', ' ', text) 55 | elif lang == 'hi': 56 | return text # Hindi does not have formal articles 57 | elif lang == 'vi': 58 | return re.sub(r'\b(của|là|cái|chiếc|những)\b', ' ', text) 59 | elif lang == 'de': 60 | return re.sub(r'\b(ein|eine|einen|einem|eines|einer|der|die|das|den|dem|des)\b', ' ', text) 61 | elif lang == 'ar': 62 | return re.sub('\sال^|ال', ' ', text) 63 | elif lang == 'zh': 64 | return text # Chinese does not have formal articles 65 | else: 66 | raise Exception('Unknown Language {}'.format(lang)) 67 | 68 | def white_space_fix(text, lang): 69 | if lang in WHITESPACE_LANGS: 70 | tokens = whitespace_tokenize(text) 71 | elif lang in MIXED_SEGMENTATION_LANGS: 72 | tokens = mixed_segmentation(text) 73 | else: 74 | raise Exception('Unknown Language {}'.format(lang)) 75 | return ' '.join([t for t in tokens if t.strip() != '']) 76 | 77 | def remove_punc(text): 78 | return ''.join(ch for ch in text if ch not in PUNCT) 79 | 80 | def lower(text): 81 | return text.lower() 82 | 83 | return white_space_fix(remove_articles(remove_punc(lower(s)), lang), lang) 84 | 85 | 86 | def f1_score(prediction, ground_truth, lang): 87 | prediction_tokens = normalize_answer(prediction, lang).split() 88 | ground_truth_tokens = normalize_answer(ground_truth, lang).split() 89 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 90 | num_same = sum(common.values()) 91 | if num_same == 0: 92 | return 0 93 | precision = 1.0 * num_same / len(prediction_tokens) 94 | recall = 1.0 * num_same / len(ground_truth_tokens) 95 | f1 = (2 * precision * recall) / (precision + recall) 96 | return f1 97 | 98 | 99 | def exact_match_score(prediction, ground_truth, lang): 100 | return (normalize_answer(prediction, lang) == normalize_answer(ground_truth, lang)) 101 | 102 | 103 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths, lang): 104 | scores_for_ground_truths = [] 105 | for ground_truth in ground_truths: 106 | score = metric_fn(prediction, ground_truth, lang) 107 | scores_for_ground_truths.append(score) 108 | return max(scores_for_ground_truths) 109 | 110 | 111 | def evaluate(dataset, predictions, lang): 112 | f1 = exact_match = total = 0 113 | for article in dataset: 114 | for paragraph in article['paragraphs']: 115 | for qa in paragraph['qas']: 116 | total += 1 117 | if qa['id'] not in predictions: 118 | message = 'Unanswered question ' + qa['id'] + \ 119 | ' will receive score 0.' 120 | print(message, file=sys.stderr) 121 | continue 122 | ground_truths = list(map(lambda x: x['text'], qa['answers'])) 123 | prediction = predictions[qa['id']] 124 | exact_match += metric_max_over_ground_truths( 125 | exact_match_score, prediction, ground_truths, lang) 126 | f1 += metric_max_over_ground_truths( 127 | f1_score, prediction, ground_truths, lang) 128 | 129 | exact_match = 100.0 * exact_match / total 130 | f1 = 100.0 * f1 / total 131 | 132 | return {'exact_match': exact_match, 'f1': f1} 133 | 134 | 135 | def evaluate_with_path(dataset_file, prediction_file, answer_language): 136 | with open(dataset_file) as dataset_file_reader: 137 | dataset_json = json.load(dataset_file_reader) 138 | dataset = dataset_json['data'] 139 | with open(prediction_file) as prediction_file_reader: 140 | predictions = json.load(prediction_file_reader) 141 | return evaluate(dataset, predictions, answer_language) 142 | 143 | if __name__ == '__main__': 144 | expected_version = '1.0' 145 | parser = argparse.ArgumentParser( 146 | description='Evaluation for MLQA ' + expected_version) 147 | parser.add_argument('dataset_file', help='Dataset file') 148 | parser.add_argument('prediction_file', help='Prediction File') 149 | parser.add_argument('answer_language', help='Language code of answer language') 150 | 151 | args = parser.parse_args() 152 | with open(args.dataset_file) as dataset_file: 153 | dataset_json = json.load(dataset_file) 154 | if (str(dataset_json['version']) != expected_version): 155 | print('Evaluation expects v-' + expected_version + 156 | ', but got dataset with v-' + dataset_json['version'], 157 | file=sys.stderr) 158 | dataset = dataset_json['data'] 159 | with open(args.prediction_file) as prediction_file: 160 | predictions = json.load(prediction_file) 161 | print(json.dumps(evaluate(dataset, predictions, args.answer_language))) 162 | -------------------------------------------------------------------------------- /src/transformers/hf_api.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import io 18 | import os 19 | from os.path import expanduser 20 | from typing import List 21 | 22 | import requests 23 | from tqdm import tqdm 24 | 25 | 26 | ENDPOINT = "https://huggingface.co" 27 | 28 | 29 | class S3Obj: 30 | def __init__(self, filename: str, LastModified: str, ETag: str, Size: int, **kwargs): 31 | self.filename = filename 32 | self.LastModified = LastModified 33 | self.ETag = ETag 34 | self.Size = Size 35 | 36 | 37 | class PresignedUrl: 38 | def __init__(self, write: str, access: str, type: str, **kwargs): 39 | self.write = write 40 | self.access = access 41 | self.type = type # mime-type to send to S3. 42 | 43 | 44 | class HfApi: 45 | def __init__(self, endpoint=None): 46 | self.endpoint = endpoint if endpoint is not None else ENDPOINT 47 | 48 | def login(self, username: str, password: str) -> str: 49 | """ 50 | Call HF API to sign in a user and get a token if credentials are valid. 51 | 52 | Outputs: 53 | token if credentials are valid 54 | 55 | Throws: 56 | requests.exceptions.HTTPError if credentials are invalid 57 | """ 58 | path = "{}/api/login".format(self.endpoint) 59 | r = requests.post(path, json={"username": username, "password": password}) 60 | r.raise_for_status() 61 | d = r.json() 62 | return d["token"] 63 | 64 | def whoami(self, token: str) -> str: 65 | """ 66 | Call HF API to know "whoami" 67 | """ 68 | path = "{}/api/whoami".format(self.endpoint) 69 | r = requests.get(path, headers={"authorization": "Bearer {}".format(token)}) 70 | r.raise_for_status() 71 | d = r.json() 72 | return d["user"] 73 | 74 | def logout(self, token: str) -> None: 75 | """ 76 | Call HF API to log out. 77 | """ 78 | path = "{}/api/logout".format(self.endpoint) 79 | r = requests.post(path, headers={"authorization": "Bearer {}".format(token)}) 80 | r.raise_for_status() 81 | 82 | def presign(self, token: str, filename: str) -> PresignedUrl: 83 | """ 84 | Call HF API to get a presigned url to upload `filename` to S3. 85 | """ 86 | path = "{}/api/presign".format(self.endpoint) 87 | r = requests.post(path, headers={"authorization": "Bearer {}".format(token)}, json={"filename": filename}) 88 | r.raise_for_status() 89 | d = r.json() 90 | return PresignedUrl(**d) 91 | 92 | def presign_and_upload(self, token: str, filename: str, filepath: str) -> str: 93 | """ 94 | Get a presigned url, then upload file to S3. 95 | 96 | Outputs: 97 | url: Read-only url for the stored file on S3. 98 | """ 99 | urls = self.presign(token, filename=filename) 100 | # streaming upload: 101 | # https://2.python-requests.org/en/master/user/advanced/#streaming-uploads 102 | # 103 | # Even though we presign with the correct content-type, 104 | # the client still has to specify it when uploading the file. 105 | with open(filepath, "rb") as f: 106 | pf = TqdmProgressFileReader(f) 107 | data = f if pf.total_size > 0 else "" 108 | 109 | r = requests.put(urls.write, data=data, headers={"content-type": urls.type}) 110 | r.raise_for_status() 111 | pf.close() 112 | return urls.access 113 | 114 | def list_objs(self, token: str) -> List[S3Obj]: 115 | """ 116 | Call HF API to list all stored files for user. 117 | """ 118 | path = "{}/api/listObjs".format(self.endpoint) 119 | r = requests.get(path, headers={"authorization": "Bearer {}".format(token)}) 120 | r.raise_for_status() 121 | d = r.json() 122 | return [S3Obj(**x) for x in d] 123 | 124 | def delete_obj(self, token: str, filename: str): 125 | """ 126 | Call HF API to delete a file stored by user 127 | """ 128 | path = "{}/api/deleteObj".format(self.endpoint) 129 | r = requests.delete(path, headers={"authorization": "Bearer {}".format(token)}, json={"filename": filename}) 130 | r.raise_for_status() 131 | 132 | 133 | class TqdmProgressFileReader: 134 | """ 135 | Wrap an io.BufferedReader `f` (such as the output of `open(…, "rb")`) 136 | and override `f.read()` so as to display a tqdm progress bar. 137 | 138 | see github.com/huggingface/transformers/pull/2078#discussion_r354739608 139 | for implementation details. 140 | """ 141 | 142 | def __init__(self, f: io.BufferedReader): 143 | self.f = f 144 | self.total_size = os.fstat(f.fileno()).st_size 145 | self.pbar = tqdm(total=self.total_size, leave=False) 146 | self.read = f.read 147 | f.read = self._read 148 | 149 | def _read(self, n=-1): 150 | self.pbar.update(n) 151 | return self.read(n) 152 | 153 | def close(self): 154 | self.pbar.close() 155 | 156 | 157 | class HfFolder: 158 | path_token = expanduser("~/.huggingface/token") 159 | 160 | @classmethod 161 | def save_token(cls, token): 162 | """ 163 | Save token, creating folder as needed. 164 | """ 165 | os.makedirs(os.path.dirname(cls.path_token), exist_ok=True) 166 | with open(cls.path_token, "w+") as f: 167 | f.write(token) 168 | 169 | @classmethod 170 | def get_token(cls): 171 | """ 172 | Get token or None if not existent. 173 | """ 174 | try: 175 | with open(cls.path_token, "r") as f: 176 | return f.read() 177 | except FileNotFoundError: 178 | pass 179 | 180 | @classmethod 181 | def delete_token(cls): 182 | """ 183 | Delete token. 184 | Do not fail if token does not exist. 185 | """ 186 | try: 187 | os.remove(cls.path_token) 188 | except FileNotFoundError: 189 | pass 190 | -------------------------------------------------------------------------------- /src/transformers/commands/convert.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser, Namespace 2 | from logging import getLogger 3 | 4 | from transformers.commands import BaseTransformersCLICommand 5 | 6 | 7 | def convert_command_factory(args: Namespace): 8 | """ 9 | Factory function used to convert a model TF 1.0 checkpoint in a PyTorch checkpoint. 10 | :return: ServeCommand 11 | """ 12 | return ConvertCommand( 13 | args.model_type, args.tf_checkpoint, args.pytorch_dump_output, args.config, args.finetuning_task_name 14 | ) 15 | 16 | 17 | class ConvertCommand(BaseTransformersCLICommand): 18 | @staticmethod 19 | def register_subcommand(parser: ArgumentParser): 20 | """ 21 | Register this command to argparse so it's available for the transformer-cli 22 | :param parser: Root parser to register command-specific arguments 23 | :return: 24 | """ 25 | train_parser = parser.add_parser( 26 | "convert", 27 | help="CLI tool to run convert model from original " 28 | "author checkpoints to Transformers PyTorch checkpoints.", 29 | ) 30 | train_parser.add_argument("--model_type", type=str, required=True, help="Model's type.") 31 | train_parser.add_argument( 32 | "--tf_checkpoint", type=str, required=True, help="TensorFlow checkpoint path or folder." 33 | ) 34 | train_parser.add_argument( 35 | "--pytorch_dump_output", type=str, required=True, help="Path to the PyTorch savd model output." 36 | ) 37 | train_parser.add_argument("--config", type=str, default="", help="Configuration file path or folder.") 38 | train_parser.add_argument( 39 | "--finetuning_task_name", 40 | type=str, 41 | default=None, 42 | help="Optional fine-tuning task name if the TF model was a finetuned model.", 43 | ) 44 | train_parser.set_defaults(func=convert_command_factory) 45 | 46 | def __init__( 47 | self, 48 | model_type: str, 49 | tf_checkpoint: str, 50 | pytorch_dump_output: str, 51 | config: str, 52 | finetuning_task_name: str, 53 | *args 54 | ): 55 | self._logger = getLogger("transformers-cli/converting") 56 | 57 | self._logger.info("Loading model {}".format(model_type)) 58 | self._model_type = model_type 59 | self._tf_checkpoint = tf_checkpoint 60 | self._pytorch_dump_output = pytorch_dump_output 61 | self._config = config 62 | self._finetuning_task_name = finetuning_task_name 63 | 64 | def run(self): 65 | if self._model_type == "bert": 66 | try: 67 | from transformers.convert_bert_original_tf_checkpoint_to_pytorch import ( 68 | convert_tf_checkpoint_to_pytorch, 69 | ) 70 | except ImportError: 71 | msg = ( 72 | "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 73 | "In that case, it requires TensorFlow to be installed. Please see " 74 | "https://www.tensorflow.org/install/ for installation instructions." 75 | ) 76 | raise ImportError(msg) 77 | 78 | convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output) 79 | elif self._model_type == "gpt": 80 | from transformers.convert_openai_original_tf_checkpoint_to_pytorch import ( 81 | convert_openai_checkpoint_to_pytorch, 82 | ) 83 | 84 | convert_openai_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output) 85 | elif self._model_type == "transfo_xl": 86 | try: 87 | from transformers.convert_transfo_xl_original_tf_checkpoint_to_pytorch import ( 88 | convert_transfo_xl_checkpoint_to_pytorch, 89 | ) 90 | except ImportError: 91 | msg = ( 92 | "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 93 | "In that case, it requires TensorFlow to be installed. Please see " 94 | "https://www.tensorflow.org/install/ for installation instructions." 95 | ) 96 | raise ImportError(msg) 97 | 98 | if "ckpt" in self._tf_checkpoint.lower(): 99 | TF_CHECKPOINT = self._tf_checkpoint 100 | TF_DATASET_FILE = "" 101 | else: 102 | TF_DATASET_FILE = self._tf_checkpoint 103 | TF_CHECKPOINT = "" 104 | convert_transfo_xl_checkpoint_to_pytorch( 105 | TF_CHECKPOINT, self._config, self._pytorch_dump_output, TF_DATASET_FILE 106 | ) 107 | elif self._model_type == "gpt2": 108 | try: 109 | from transformers.convert_gpt2_original_tf_checkpoint_to_pytorch import ( 110 | convert_gpt2_checkpoint_to_pytorch, 111 | ) 112 | except ImportError: 113 | msg = ( 114 | "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 115 | "In that case, it requires TensorFlow to be installed. Please see " 116 | "https://www.tensorflow.org/install/ for installation instructions." 117 | ) 118 | raise ImportError(msg) 119 | 120 | convert_gpt2_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output) 121 | elif self._model_type == "xlnet": 122 | try: 123 | from transformers.convert_xlnet_original_tf_checkpoint_to_pytorch import ( 124 | convert_xlnet_checkpoint_to_pytorch, 125 | ) 126 | except ImportError: 127 | msg = ( 128 | "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 129 | "In that case, it requires TensorFlow to be installed. Please see " 130 | "https://www.tensorflow.org/install/ for installation instructions." 131 | ) 132 | raise ImportError(msg) 133 | 134 | convert_xlnet_checkpoint_to_pytorch( 135 | self._tf_checkpoint, self._config, self._pytorch_dump_output, self._finetuning_task_name 136 | ) 137 | elif self._model_type == "xlm": 138 | from transformers.convert_xlm_original_pytorch_checkpoint_to_pytorch import ( 139 | convert_xlm_checkpoint_to_pytorch, 140 | ) 141 | 142 | convert_xlm_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output) 143 | else: 144 | raise ValueError("--model_type should be selected in the list [bert, gpt, gpt2, transfo_xl, xlnet, xlm]") 145 | -------------------------------------------------------------------------------- /src/transformers/configuration_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ DistilBERT model configuration """ 16 | 17 | 18 | import logging 19 | 20 | from .configuration_utils import PretrainedConfig 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 26 | "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", 27 | "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json", 28 | "distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-config.json", 29 | "distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-distilled-squad-config.json", 30 | "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json", 31 | "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json", 32 | "distilbert-base-uncased-finetuned-sst-2-english": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json", 33 | } 34 | 35 | 36 | class DistilBertConfig(PretrainedConfig): 37 | r""" 38 | This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`. 39 | It is used to instantiate a DistilBERT model according to the specified arguments, defining the model 40 | architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of 41 | the DistilBERT `distilbert-base-uncased `__ architecture. 42 | 43 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used 44 | to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` 45 | for more information. 46 | 47 | 48 | Args: 49 | vocab_size (:obj:`int`, optional, defaults to 30522): 50 | Vocabulary size of the DistilBERT model. Defines the different tokens that 51 | can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`. 52 | max_position_embeddings (:obj:`int`, optional, defaults to 512): 53 | The maximum sequence length that this model might ever be used with. 54 | Typically set this to something large just in case (e.g., 512 or 1024 or 2048). 55 | sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`): 56 | Whether to use sinusoidal positional embeddings. 57 | n_layers (:obj:`int`, optional, defaults to 6): 58 | Number of hidden layers in the Transformer encoder. 59 | n_heads (:obj:`int`, optional, defaults to 12): 60 | Number of attention heads for each attention layer in the Transformer encoder. 61 | dim (:obj:`int`, optional, defaults to 768): 62 | Dimensionality of the encoder layers and the pooler layer. 63 | hidden_dim (:obj:`int`, optional, defaults to 3072): 64 | The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 65 | dropout (:obj:`float`, optional, defaults to 0.1): 66 | The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. 67 | attention_dropout (:obj:`float`, optional, defaults to 0.1): 68 | The dropout ratio for the attention probabilities. 69 | activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): 70 | The non-linear activation function (function or string) in the encoder and pooler. 71 | If string, "gelu", "relu", "swish" and "gelu_new" are supported. 72 | initializer_range (:obj:`float`, optional, defaults to 0.02): 73 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 74 | qa_dropout (:obj:`float`, optional, defaults to 0.1): 75 | The dropout probabilities used in the question answering model 76 | :class:`~tranformers.DistilBertForQuestionAnswering`. 77 | seq_classif_dropout (:obj:`float`, optional, defaults to 0.2): 78 | The dropout probabilities used in the sequence classification model 79 | :class:`~tranformers.DistilBertForSequenceClassification`. 80 | 81 | Example:: 82 | 83 | from transformers import DistilBertModel, DistilBertConfig 84 | 85 | # Initializing a DistilBERT configuration 86 | configuration = DistilBertConfig() 87 | 88 | # Initializing a model from the configuration 89 | model = DistilBertModel(configuration) 90 | 91 | # Accessing the model configuration 92 | configuration = model.config 93 | 94 | Attributes: 95 | pretrained_config_archive_map (Dict[str, str]): 96 | A dictionary containing all the available pre-trained checkpoints. 97 | """ 98 | pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 99 | model_type = "distilbert" 100 | 101 | def __init__( 102 | self, 103 | vocab_size=30522, 104 | max_position_embeddings=512, 105 | sinusoidal_pos_embds=False, 106 | n_layers=6, 107 | n_heads=12, 108 | dim=768, 109 | hidden_dim=4 * 768, 110 | dropout=0.1, 111 | attention_dropout=0.1, 112 | activation="gelu", 113 | initializer_range=0.02, 114 | qa_dropout=0.1, 115 | seq_classif_dropout=0.2, 116 | **kwargs 117 | ): 118 | super().__init__(**kwargs) 119 | self.vocab_size = vocab_size 120 | self.max_position_embeddings = max_position_embeddings 121 | self.sinusoidal_pos_embds = sinusoidal_pos_embds 122 | self.n_layers = n_layers 123 | self.n_heads = n_heads 124 | self.dim = dim 125 | self.hidden_dim = hidden_dim 126 | self.dropout = dropout 127 | self.attention_dropout = attention_dropout 128 | self.activation = activation 129 | self.initializer_range = initializer_range 130 | self.qa_dropout = qa_dropout 131 | self.seq_classif_dropout = seq_classif_dropout 132 | 133 | @property 134 | def hidden_size(self): 135 | return self.dim 136 | 137 | @property 138 | def num_attention_heads(self): 139 | return self.n_heads 140 | 141 | @property 142 | def num_hidden_layers(self): 143 | return self.n_layers 144 | -------------------------------------------------------------------------------- /src/transformers/tokenization_t5.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 T5 Authors and HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Tokenization class for model T5.""" 16 | 17 | 18 | import logging 19 | import os 20 | import re 21 | from shutil import copyfile 22 | 23 | from .tokenization_utils import PreTrainedTokenizer 24 | 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | SPIECE_UNDERLINE = "▁" 29 | 30 | #################################################### 31 | # Mapping from the keyword arguments names of Tokenizer `__init__` 32 | # to file names for serializing Tokenizer instances 33 | #################################################### 34 | VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} 35 | 36 | #################################################### 37 | # Mapping from the keyword arguments names of Tokenizer `__init__` 38 | # to pretrained vocabulary URL for all the model shortcut names. 39 | #################################################### 40 | PRETRAINED_VOCAB_FILES_MAP = { 41 | "vocab_file": { 42 | "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", 43 | "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", 44 | "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", 45 | "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", 46 | "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", 47 | } 48 | } 49 | 50 | #################################################### 51 | # Mapping from model shortcut names to max length of inputs 52 | #################################################### 53 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 54 | "t5-small": 512, 55 | "t5-base": 512, 56 | "t5-large": 512, 57 | "t5-3b": 512, 58 | "t5-11b": 512, 59 | } 60 | 61 | 62 | class T5Tokenizer(PreTrainedTokenizer): 63 | """ 64 | SentencePiece based tokenizer. Peculiarities: 65 | 66 | - requires `SentencePiece `_ 67 | - `extra_ids` add a number of extra ids added to the end of the vocabulary for use as sentinels. 68 | These tokens are accessible as `` where `{%d}` is a number between 0 and extra_ids-1. 69 | Extra tokens are indexed from the end of the vocabulary up to beginnning ( is the last token in the vocabulary) 70 | (like in T5 preprocessing 71 | see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117) 72 | """ 73 | 74 | vocab_files_names = VOCAB_FILES_NAMES 75 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 76 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 77 | 78 | def __init__( 79 | self, 80 | vocab_file, 81 | eos_token="", 82 | unk_token="", 83 | pad_token="", 84 | extra_ids=100, 85 | additional_special_tokens=None, 86 | **kwargs 87 | ): 88 | # Add extra_ids to the special token list 89 | if extra_ids > 0: 90 | if additional_special_tokens is None: 91 | additional_special_tokens = [] 92 | additional_special_tokens.extend(["".format(i) for i in range(extra_ids)]) 93 | 94 | super().__init__( 95 | eos_token=eos_token, 96 | unk_token=unk_token, 97 | pad_token=pad_token, 98 | additional_special_tokens=additional_special_tokens, 99 | **kwargs, 100 | ) 101 | self.max_len_single_sentence = ( 102 | self.max_len 103 | ) # no default special tokens - you can update this value if you add special tokens 104 | self.max_len_sentences_pair = ( 105 | self.max_len 106 | ) # no default special tokens - you can update this value if you add special tokens 107 | 108 | try: 109 | import sentencepiece as spm 110 | except ImportError: 111 | logger.warning( 112 | "You need to install SentencePiece to use T5Tokenizer:" 113 | "https://github.com/google/sentencepiece" 114 | "pip install sentencepiece" 115 | ) 116 | raise 117 | 118 | self.vocab_file = vocab_file 119 | self._extra_ids = extra_ids 120 | 121 | self.sp_model = spm.SentencePieceProcessor() 122 | self.sp_model.Load(vocab_file) 123 | 124 | @property 125 | def vocab_size(self): 126 | return self.sp_model.get_piece_size() + self._extra_ids 127 | 128 | def get_vocab(self): 129 | vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} 130 | vocab.update(self.added_tokens_encoder) 131 | return vocab 132 | 133 | def __getstate__(self): 134 | state = self.__dict__.copy() 135 | state["sp_model"] = None 136 | return state 137 | 138 | def __setstate__(self, d): 139 | self.__dict__ = d 140 | try: 141 | import sentencepiece as spm 142 | except ImportError: 143 | logger.warning( 144 | "You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece" 145 | "pip install sentencepiece" 146 | ) 147 | raise 148 | self.sp_model = spm.SentencePieceProcessor() 149 | self.sp_model.Load(self.vocab_file) 150 | 151 | def _tokenize(self, text, sample=False): 152 | """ Take as input a string and return a list of strings (tokens) for words/sub-words 153 | """ 154 | if not sample: 155 | pieces = self.sp_model.EncodeAsPieces(text) 156 | else: 157 | pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) 158 | return pieces 159 | 160 | def _convert_token_to_id(self, token): 161 | """ Converts a token (str) in an id using the vocab. """ 162 | if token.startswith("", token) 164 | num = int(match.group(1)) 165 | return self.vocab_size - num - 1 166 | return self.sp_model.piece_to_id(token) 167 | 168 | def _convert_id_to_token(self, index): 169 | """Converts an index (integer) in a token (str) using the vocab.""" 170 | if index < self.sp_model.get_piece_size(): 171 | token = self.sp_model.IdToPiece(index) 172 | else: 173 | token = "".format(self.vocab_size - 1 - index) 174 | return token 175 | 176 | def convert_tokens_to_string(self, tokens): 177 | """ Converts a sequence of tokens (string) in a single string. """ 178 | out_string = self.sp_model.decode_pieces(tokens) 179 | return out_string 180 | 181 | def save_vocabulary(self, save_directory): 182 | """ Save the sentencepiece vocabulary (copy original file) and special tokens file 183 | to a directory. 184 | """ 185 | if not os.path.isdir(save_directory): 186 | logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) 187 | return 188 | out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) 189 | 190 | if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): 191 | copyfile(self.vocab_file, out_vocab_file) 192 | 193 | return (out_vocab_file,) 194 | -------------------------------------------------------------------------------- /src/transformers/configuration_albert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ ALBERT model configuration """ 17 | 18 | from .configuration_utils import PretrainedConfig 19 | 20 | 21 | ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 22 | "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-config.json", 23 | "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-config.json", 24 | "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-config.json", 25 | "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-config.json", 26 | "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-config.json", 27 | "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-config.json", 28 | "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-config.json", 29 | "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-config.json", 30 | } 31 | 32 | 33 | class AlbertConfig(PretrainedConfig): 34 | r""" 35 | This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`. 36 | It is used to instantiate an ALBERT model according to the specified arguments, defining the model 37 | architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of 38 | the ALBERT `xxlarge `__ architecture. 39 | 40 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used 41 | to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` 42 | for more information. 43 | 44 | 45 | Args: 46 | vocab_size (:obj:`int`, optional, defaults to 30000): 47 | Vocabulary size of the ALBERT model. Defines the different tokens that 48 | can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`. 49 | embedding_size (:obj:`int`, optional, defaults to 128): 50 | Dimensionality of vocabulary embeddings. 51 | hidden_size (:obj:`int`, optional, defaults to 4096): 52 | Dimensionality of the encoder layers and the pooler layer. 53 | num_hidden_layers (:obj:`int`, optional, defaults to 12): 54 | Number of hidden layers in the Transformer encoder. 55 | num_hidden_groups (:obj:`int`, optional, defaults to 1): 56 | Number of groups for the hidden layers, parameters in the same group are shared. 57 | num_attention_heads (:obj:`int`, optional, defaults to 64): 58 | Number of attention heads for each attention layer in the Transformer encoder. 59 | intermediate_size (:obj:`int`, optional, defaults to 16384): 60 | The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 61 | inner_group_num (:obj:`int`, optional, defaults to 1): 62 | The number of inner repetition of attention and ffn. 63 | hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"): 64 | The non-linear activation function (function or string) in the encoder and pooler. 65 | If string, "gelu", "relu", "swish" and "gelu_new" are supported. 66 | hidden_dropout_prob (:obj:`float`, optional, defaults to 0): 67 | The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. 68 | attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0): 69 | The dropout ratio for the attention probabilities. 70 | max_position_embeddings (:obj:`int`, optional, defaults to 512): 71 | The maximum sequence length that this model might ever be used with. Typically set this to something 72 | large (e.g., 512 or 1024 or 2048). 73 | type_vocab_size (:obj:`int`, optional, defaults to 2): 74 | The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`. 75 | initializer_range (:obj:`float`, optional, defaults to 0.02): 76 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 77 | layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): 78 | The epsilon used by the layer normalization layers. 79 | classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1): 80 | The dropout ratio for attached classifiers. 81 | 82 | Example:: 83 | 84 | from transformers import AlbertConfig, AlbertModel 85 | # Initializing an ALBERT-xxlarge style configuration 86 | albert_xxlarge_configuration = AlbertConfig() 87 | 88 | # Initializing an ALBERT-base style configuration 89 | albert_base_configuration = AlbertConfig( 90 | hidden_size=768, 91 | num_attention_heads=12, 92 | intermediate_size=3072, 93 | ) 94 | 95 | # Initializing a model from the ALBERT-base style configuration 96 | model = AlbertModel(albert_xxlarge_configuration) 97 | 98 | # Accessing the model configuration 99 | configuration = model.config 100 | 101 | Attributes: 102 | pretrained_config_archive_map (Dict[str, str]): 103 | A dictionary containing all the available pre-trained checkpoints. 104 | """ 105 | 106 | pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 107 | model_type = "albert" 108 | 109 | def __init__( 110 | self, 111 | vocab_size=30000, 112 | embedding_size=128, 113 | hidden_size=4096, 114 | num_hidden_layers=12, 115 | num_hidden_groups=1, 116 | num_attention_heads=64, 117 | intermediate_size=16384, 118 | inner_group_num=1, 119 | hidden_act="gelu_new", 120 | hidden_dropout_prob=0, 121 | attention_probs_dropout_prob=0, 122 | max_position_embeddings=512, 123 | type_vocab_size=2, 124 | initializer_range=0.02, 125 | layer_norm_eps=1e-12, 126 | classifier_dropout_prob=0.1, 127 | **kwargs 128 | ): 129 | super().__init__(**kwargs) 130 | 131 | self.vocab_size = vocab_size 132 | self.embedding_size = embedding_size 133 | self.hidden_size = hidden_size 134 | self.num_hidden_layers = num_hidden_layers 135 | self.num_hidden_groups = num_hidden_groups 136 | self.num_attention_heads = num_attention_heads 137 | self.inner_group_num = inner_group_num 138 | self.hidden_act = hidden_act 139 | self.intermediate_size = intermediate_size 140 | self.hidden_dropout_prob = hidden_dropout_prob 141 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 142 | self.max_position_embeddings = max_position_embeddings 143 | self.type_vocab_size = type_vocab_size 144 | self.initializer_range = initializer_range 145 | self.layer_norm_eps = layer_norm_eps 146 | self.classifier_dropout_prob = classifier_dropout_prob 147 | -------------------------------------------------------------------------------- /src/transformers/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """PyTorch optimization for BERT model.""" 16 | 17 | import logging 18 | import math 19 | 20 | import torch 21 | from torch.optim import Optimizer 22 | from torch.optim.lr_scheduler import LambdaLR 23 | 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | def get_constant_schedule(optimizer, last_epoch=-1): 29 | """ Create a schedule with a constant learning rate. 30 | """ 31 | return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch) 32 | 33 | 34 | def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1): 35 | """ Create a schedule with a constant learning rate preceded by a warmup 36 | period during which the learning rate increases linearly between 0 and 1. 37 | """ 38 | 39 | def lr_lambda(current_step): 40 | if current_step < num_warmup_steps: 41 | return float(current_step) / float(max(1.0, num_warmup_steps)) 42 | return 1.0 43 | 44 | return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch) 45 | 46 | 47 | def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): 48 | """ Create a schedule with a learning rate that decreases linearly after 49 | linearly increasing during a warmup period. 50 | """ 51 | 52 | def lr_lambda(current_step): 53 | if current_step < num_warmup_steps: 54 | return float(current_step) / float(max(1, num_warmup_steps)) 55 | return max( 56 | 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)) 57 | ) 58 | 59 | return LambdaLR(optimizer, lr_lambda, last_epoch) 60 | 61 | 62 | def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1): 63 | """ Create a schedule with a learning rate that decreases following the 64 | values of the cosine function between 0 and `pi * cycles` after a warmup 65 | period during which it increases linearly between 0 and 1. 66 | """ 67 | 68 | def lr_lambda(current_step): 69 | if current_step < num_warmup_steps: 70 | return float(current_step) / float(max(1, num_warmup_steps)) 71 | progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) 72 | return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) 73 | 74 | return LambdaLR(optimizer, lr_lambda, last_epoch) 75 | 76 | 77 | def get_cosine_with_hard_restarts_schedule_with_warmup( 78 | optimizer, num_warmup_steps, num_training_steps, num_cycles=1.0, last_epoch=-1 79 | ): 80 | """ Create a schedule with a learning rate that decreases following the 81 | values of the cosine function with several hard restarts, after a warmup 82 | period during which it increases linearly between 0 and 1. 83 | """ 84 | 85 | def lr_lambda(current_step): 86 | if current_step < num_warmup_steps: 87 | return float(current_step) / float(max(1, num_warmup_steps)) 88 | progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) 89 | if progress >= 1.0: 90 | return 0.0 91 | return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0)))) 92 | 93 | return LambdaLR(optimizer, lr_lambda, last_epoch) 94 | 95 | 96 | class AdamW(Optimizer): 97 | """ Implements Adam algorithm with weight decay fix. 98 | 99 | Parameters: 100 | lr (float): learning rate. Default 1e-3. 101 | betas (tuple of 2 floats): Adams beta parameters (b1, b2). Default: (0.9, 0.999) 102 | eps (float): Adams epsilon. Default: 1e-6 103 | weight_decay (float): Weight decay. Default: 0.0 104 | correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True. 105 | """ 106 | 107 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True): 108 | if lr < 0.0: 109 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 110 | if not 0.0 <= betas[0] < 1.0: 111 | raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0])) 112 | if not 0.0 <= betas[1] < 1.0: 113 | raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1])) 114 | if not 0.0 <= eps: 115 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps)) 116 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias) 117 | super().__init__(params, defaults) 118 | 119 | def step(self, closure=None): 120 | """Performs a single optimization step. 121 | 122 | Arguments: 123 | closure (callable, optional): A closure that reevaluates the model 124 | and returns the loss. 125 | """ 126 | loss = None 127 | if closure is not None: 128 | loss = closure() 129 | 130 | for group in self.param_groups: 131 | for p in group["params"]: 132 | if p.grad is None: 133 | continue 134 | grad = p.grad.data 135 | if grad.is_sparse: 136 | raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead") 137 | 138 | state = self.state[p] 139 | 140 | # State initialization 141 | if len(state) == 0: 142 | state["step"] = 0 143 | # Exponential moving average of gradient values 144 | state["exp_avg"] = torch.zeros_like(p.data) 145 | # Exponential moving average of squared gradient values 146 | state["exp_avg_sq"] = torch.zeros_like(p.data) 147 | 148 | exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] 149 | beta1, beta2 = group["betas"] 150 | 151 | state["step"] += 1 152 | 153 | # Decay the first and second moment running average coefficient 154 | # In-place operations to update the averages at the same time 155 | exp_avg.mul_(beta1).add_(1.0 - beta1, grad) 156 | exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad) 157 | denom = exp_avg_sq.sqrt().add_(group["eps"]) 158 | 159 | step_size = group["lr"] 160 | if group["correct_bias"]: # No bias correction for Bert 161 | bias_correction1 = 1.0 - beta1 ** state["step"] 162 | bias_correction2 = 1.0 - beta2 ** state["step"] 163 | step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 164 | 165 | p.data.addcdiv_(-step_size, exp_avg, denom) 166 | 167 | # Just adding the square of the weights to the loss function is *not* 168 | # the correct way of using L2 regularization/weight decay with Adam, 169 | # since that will interact with the m and v parameters in strange ways. 170 | # 171 | # Instead we want to decay the weights in a manner that doesn't interact 172 | # with the m/v parameters. This is equivalent to adding the square 173 | # of the weights to the loss with plain (non-momentum) SGD. 174 | # Add weight decay at the end (fixed version) 175 | if group["weight_decay"] > 0.0: 176 | p.data.add_(-group["lr"] * group["weight_decay"], p.data) 177 | 178 | return loss 179 | -------------------------------------------------------------------------------- /src/transformers/commands/serving.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from argparse import ArgumentParser, Namespace 3 | from typing import Any, List, Optional 4 | 5 | from transformers import Pipeline 6 | from transformers.commands import BaseTransformersCLICommand 7 | from transformers.pipelines import SUPPORTED_TASKS, pipeline 8 | 9 | 10 | try: 11 | from uvicorn import run 12 | from fastapi import FastAPI, HTTPException, Body 13 | from fastapi.routing import APIRoute 14 | from pydantic import BaseModel 15 | from starlette.responses import JSONResponse 16 | 17 | _serve_dependencies_installed = True 18 | except (ImportError, AttributeError): 19 | BaseModel = object 20 | 21 | def Body(*x, **y): 22 | pass 23 | 24 | _serve_dependencies_installed = False 25 | 26 | 27 | logger = logging.getLogger("transformers-cli/serving") 28 | 29 | 30 | def serve_command_factory(args: Namespace): 31 | """ 32 | Factory function used to instantiate serving server from provided command line arguments. 33 | :return: ServeCommand 34 | """ 35 | nlp = pipeline( 36 | task=args.task, 37 | model=args.model if args.model else None, 38 | config=args.config, 39 | tokenizer=args.tokenizer, 40 | device=args.device, 41 | ) 42 | return ServeCommand(nlp, args.host, args.port, args.workers) 43 | 44 | 45 | class ServeModelInfoResult(BaseModel): 46 | """ 47 | Expose model information 48 | """ 49 | 50 | infos: dict 51 | 52 | 53 | class ServeTokenizeResult(BaseModel): 54 | """ 55 | Tokenize result model 56 | """ 57 | 58 | tokens: List[str] 59 | tokens_ids: Optional[List[int]] 60 | 61 | 62 | class ServeDeTokenizeResult(BaseModel): 63 | """ 64 | DeTokenize result model 65 | """ 66 | 67 | text: str 68 | 69 | 70 | class ServeForwardResult(BaseModel): 71 | """ 72 | Forward result model 73 | """ 74 | 75 | output: Any 76 | 77 | 78 | class ServeCommand(BaseTransformersCLICommand): 79 | @staticmethod 80 | def register_subcommand(parser: ArgumentParser): 81 | """ 82 | Register this command to argparse so it's available for the transformer-cli 83 | :param parser: Root parser to register command-specific arguments 84 | :return: 85 | """ 86 | serve_parser = parser.add_parser( 87 | "serve", help="CLI tool to run inference requests through REST and GraphQL endpoints." 88 | ) 89 | serve_parser.add_argument( 90 | "--task", type=str, choices=SUPPORTED_TASKS.keys(), help="The task to run the pipeline on" 91 | ) 92 | serve_parser.add_argument("--host", type=str, default="localhost", help="Interface the server will listen on.") 93 | serve_parser.add_argument("--port", type=int, default=8888, help="Port the serving will listen to.") 94 | serve_parser.add_argument("--workers", type=int, default=1, help="Number of http workers") 95 | serve_parser.add_argument("--model", type=str, help="Model's name or path to stored model.") 96 | serve_parser.add_argument("--config", type=str, help="Model's config name or path to stored model.") 97 | serve_parser.add_argument("--tokenizer", type=str, help="Tokenizer name to use.") 98 | serve_parser.add_argument( 99 | "--device", 100 | type=int, 101 | default=-1, 102 | help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)", 103 | ) 104 | serve_parser.set_defaults(func=serve_command_factory) 105 | 106 | def __init__(self, pipeline: Pipeline, host: str, port: int, workers: int): 107 | 108 | self._pipeline = pipeline 109 | 110 | self.host = host 111 | self.port = port 112 | self.workers = workers 113 | 114 | if not _serve_dependencies_installed: 115 | raise RuntimeError( 116 | "Using serve command requires FastAPI and unicorn. " 117 | 'Please install transformers with [serving]: pip install "transformers[serving]".' 118 | "Or install FastAPI and unicorn separately." 119 | ) 120 | else: 121 | logger.info("Serving model over {}:{}".format(host, port)) 122 | self._app = FastAPI( 123 | routes=[ 124 | APIRoute( 125 | "/", 126 | self.model_info, 127 | response_model=ServeModelInfoResult, 128 | response_class=JSONResponse, 129 | methods=["GET"], 130 | ), 131 | APIRoute( 132 | "/tokenize", 133 | self.tokenize, 134 | response_model=ServeTokenizeResult, 135 | response_class=JSONResponse, 136 | methods=["POST"], 137 | ), 138 | APIRoute( 139 | "/detokenize", 140 | self.detokenize, 141 | response_model=ServeDeTokenizeResult, 142 | response_class=JSONResponse, 143 | methods=["POST"], 144 | ), 145 | APIRoute( 146 | "/forward", 147 | self.forward, 148 | response_model=ServeForwardResult, 149 | response_class=JSONResponse, 150 | methods=["POST"], 151 | ), 152 | ], 153 | timeout=600, 154 | ) 155 | 156 | def run(self): 157 | run(self._app, host=self.host, port=self.port, workers=self.workers) 158 | 159 | def model_info(self): 160 | return ServeModelInfoResult(infos=vars(self._pipeline.model.config)) 161 | 162 | def tokenize(self, text_input: str = Body(None, embed=True), return_ids: bool = Body(False, embed=True)): 163 | """ 164 | Tokenize the provided input and eventually returns corresponding tokens id: 165 | - **text_input**: String to tokenize 166 | - **return_ids**: Boolean flags indicating if the tokens have to be converted to their integer mapping. 167 | """ 168 | try: 169 | tokens_txt = self._pipeline.tokenizer.tokenize(text_input) 170 | 171 | if return_ids: 172 | tokens_ids = self._pipeline.tokenizer.convert_tokens_to_ids(tokens_txt) 173 | return ServeTokenizeResult(tokens=tokens_txt, tokens_ids=tokens_ids) 174 | else: 175 | return ServeTokenizeResult(tokens=tokens_txt) 176 | 177 | except Exception as e: 178 | raise HTTPException(status_code=500, detail={"model": "", "error": str(e)}) 179 | 180 | def detokenize( 181 | self, 182 | tokens_ids: List[int] = Body(None, embed=True), 183 | skip_special_tokens: bool = Body(False, embed=True), 184 | cleanup_tokenization_spaces: bool = Body(True, embed=True), 185 | ): 186 | """ 187 | Detokenize the provided tokens ids to readable text: 188 | - **tokens_ids**: List of tokens ids 189 | - **skip_special_tokens**: Flag indicating to not try to decode special tokens 190 | - **cleanup_tokenization_spaces**: Flag indicating to remove all leading/trailing spaces and intermediate ones. 191 | """ 192 | try: 193 | decoded_str = self._pipeline.tokenizer.decode(tokens_ids, skip_special_tokens, cleanup_tokenization_spaces) 194 | return ServeDeTokenizeResult(model="", text=decoded_str) 195 | except Exception as e: 196 | raise HTTPException(status_code=500, detail={"model": "", "error": str(e)}) 197 | 198 | async def forward(self, inputs=Body(None, embed=True)): 199 | """ 200 | **inputs**: 201 | **attention_mask**: 202 | **tokens_type_ids**: 203 | """ 204 | 205 | # Check we don't have empty string 206 | if len(inputs) == 0: 207 | return ServeForwardResult(output=[], attention=[]) 208 | 209 | try: 210 | # Forward through the model 211 | output = self._pipeline(inputs) 212 | return ServeForwardResult(output=output) 213 | except Exception as e: 214 | raise HTTPException(500, {"error": str(e)}) 215 | -------------------------------------------------------------------------------- /src/transformers/modeling_tf_transfo_xl_utilities.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ A TF 2.0 Adaptive Softmax for Transformer XL model. 17 | """ 18 | 19 | 20 | import tensorflow as tf 21 | 22 | from .modeling_tf_utils import shape_list 23 | 24 | 25 | class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): 26 | def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs): 27 | super().__init__(**kwargs) 28 | 29 | self.vocab_size = vocab_size 30 | self.d_embed = d_embed 31 | self.d_proj = d_proj 32 | 33 | self.cutoffs = cutoffs + [vocab_size] 34 | self.cutoff_ends = [0] + self.cutoffs 35 | self.div_val = div_val 36 | 37 | self.shortlist_size = self.cutoffs[0] 38 | self.n_clusters = len(self.cutoffs) - 1 39 | self.head_size = self.shortlist_size + self.n_clusters 40 | self.keep_order = keep_order 41 | 42 | self.out_layers = [] 43 | self.out_projs = [] 44 | 45 | def build(self, input_shape): 46 | if self.n_clusters > 0: 47 | self.cluster_weight = self.add_weight( 48 | shape=(self.n_clusters, self.d_embed), initializer="zeros", trainable=True, name="cluster_weight" 49 | ) 50 | self.cluster_bias = self.add_weight( 51 | shape=(self.n_clusters,), initializer="zeros", trainable=True, name="cluster_bias" 52 | ) 53 | 54 | if self.div_val == 1: 55 | for i in range(len(self.cutoffs)): 56 | if self.d_proj != self.d_embed: 57 | weight = self.add_weight( 58 | shape=(self.d_embed, self.d_proj), 59 | initializer="zeros", 60 | trainable=True, 61 | name="out_projs_._{}".format(i), 62 | ) 63 | self.out_projs.append(weight) 64 | else: 65 | self.out_projs.append(None) 66 | weight = self.add_weight( 67 | shape=(self.vocab_size, self.d_embed,), 68 | initializer="zeros", 69 | trainable=True, 70 | name="out_layers_._{}_._weight".format(i), 71 | ) 72 | bias = self.add_weight( 73 | shape=(self.vocab_size,), 74 | initializer="zeros", 75 | trainable=True, 76 | name="out_layers_._{}_._bias".format(i), 77 | ) 78 | self.out_layers.append((weight, bias)) 79 | else: 80 | for i in range(len(self.cutoffs)): 81 | l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] 82 | d_emb_i = self.d_embed // (self.div_val ** i) 83 | 84 | weight = self.add_weight( 85 | shape=(d_emb_i, self.d_proj), initializer="zeros", trainable=True, name="out_projs_._{}".format(i) 86 | ) 87 | self.out_projs.append(weight) 88 | weight = self.add_weight( 89 | shape=(r_idx - l_idx, d_emb_i,), 90 | initializer="zeros", 91 | trainable=True, 92 | name="out_layers_._{}_._weight".format(i), 93 | ) 94 | bias = self.add_weight( 95 | shape=(r_idx - l_idx,), 96 | initializer="zeros", 97 | trainable=True, 98 | name="out_layers_._{}_._bias".format(i), 99 | ) 100 | self.out_layers.append((weight, bias)) 101 | super().build(input_shape) 102 | 103 | @staticmethod 104 | def _logit(x, W, b, proj=None): 105 | y = x 106 | if proj is not None: 107 | y = tf.einsum("ibd,ed->ibe", y, proj) 108 | return tf.einsum("ibd,nd->ibn", y, W) + b 109 | 110 | @staticmethod 111 | def _gather_logprob(logprob, target): 112 | lp_size = shape_list(logprob) 113 | r = tf.range(lp_size[0]) 114 | idx = tf.stack([r, target], 1) 115 | return tf.gather_nd(logprob, idx) 116 | 117 | def call(self, inputs, return_mean=True, training=False): 118 | hidden, target = inputs 119 | head_logprob = 0 120 | if self.n_clusters == 0: 121 | output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0]) 122 | if target is not None: 123 | loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output) 124 | out = tf.nn.log_softmax(output, axis=-1) 125 | else: 126 | hidden_sizes = shape_list(hidden) 127 | out = [] 128 | loss = tf.zeros(hidden_sizes[:2], dtype=tf.float32) 129 | for i in range(len(self.cutoffs)): 130 | l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] 131 | if target is not None: 132 | mask = (target >= l_idx) & (target < r_idx) 133 | mask_idx = tf.where(mask) 134 | cur_target = tf.boolean_mask(target, mask) - l_idx 135 | 136 | if self.div_val == 1: 137 | cur_W = self.out_layers[0][0][l_idx:r_idx] 138 | cur_b = self.out_layers[0][1][l_idx:r_idx] 139 | else: 140 | cur_W = self.out_layers[i][0] 141 | cur_b = self.out_layers[i][1] 142 | 143 | if i == 0: 144 | cur_W = tf.concat([cur_W, self.cluster_weight], 0) 145 | cur_b = tf.concat([cur_b, self.cluster_bias], 0) 146 | 147 | head_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[0]) 148 | head_logprob = tf.nn.log_softmax(head_logit) 149 | out.append(head_logprob[..., : self.cutoffs[0]]) 150 | if target is not None: 151 | cur_head_logprob = tf.boolean_mask(head_logprob, mask) 152 | cur_logprob = self._gather_logprob(cur_head_logprob, cur_target) 153 | else: 154 | tail_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[i]) 155 | tail_logprob = tf.nn.log_softmax(tail_logit) 156 | cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster 157 | logprob_i = head_logprob[..., cluster_prob_idx, None] + tail_logprob 158 | out.append(logprob_i) 159 | if target is not None: 160 | cur_head_logprob = tf.boolean_mask(head_logprob, mask) 161 | cur_tail_logprob = tf.boolean_mask(tail_logprob, mask) 162 | cur_logprob = self._gather_logprob(cur_tail_logprob, cur_target) 163 | cur_logprob += cur_head_logprob[:, self.cutoff_ends[1] + i - 1] 164 | if target is not None: 165 | loss += tf.scatter_nd(mask_idx, -cur_logprob, tf.cast(shape_list(loss), dtype=tf.int64)) 166 | out = tf.concat(out, axis=-1) 167 | 168 | if target is not None: 169 | if return_mean: 170 | loss = tf.reduce_mean(loss) 171 | # Add the training-time loss value to the layer using `self.add_loss()`. 172 | self.add_loss(loss) 173 | 174 | # Log the loss as a metric (we could log arbitrary metrics, 175 | # including different metrics for training and inference. 176 | self.add_metric(loss, name=self.name, aggregation="mean" if return_mean else "") 177 | 178 | return out 179 | -------------------------------------------------------------------------------- /train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source init.sh 4 | 5 | MODEL_NAME_OR_PATH='xlm-roberta-large' 6 | 7 | usage() 8 | { 9 | cat << EOF 10 | usage: $0 options 11 | OPTIONS: 12 | -h Show the help and exit 13 | -n Experiment name for saving to output directory 14 | -m Pretrained model name or path 15 | -g gpus to use, default is to use all GPUs 16 | -t task to train 17 | -x For convinent usage 18 | EOF 19 | } 20 | 21 | while getopts "h:m:n:g:t:x:" opt 22 | do 23 | case $opt in 24 | h) 25 | usage 26 | exit 1 27 | ;; 28 | n) 29 | EXP_NAME=$OPTARG 30 | ;; 31 | m) 32 | MODEL_NAME_OR_PATH=${OPTARG} 33 | ;; 34 | g) 35 | N_GPU=$OPTARG 36 | ;; 37 | t) 38 | TASK=$OPTARG 39 | ;; 40 | x) 41 | OTHER_ARGS=$OPTARG 42 | ;; 43 | esac 44 | done 45 | 46 | DATA_DIR=$DATA_ROOT/data_raw 47 | if [[ ! -d $DATA_DIR ]]; then 48 | echo "$DATA_DIR not exist" 49 | exit 1 50 | fi 51 | 52 | 53 | OUTPUT_DIR=$DATA_ROOT/outputs/${EXP_NAME:-debug} 54 | mkdir -p $OUTPUT_DIR 55 | 56 | xnli() { 57 | python -m torch.distributed.launch --nproc_per_node=$N_GPU --master_port=$RANDOM ./examples/run_xcls.py \ 58 | --task_name xnli \ 59 | --data_dir $DATA_DIR/xnli \ 60 | --model_type filter \ 61 | --model_name_or_path $MODEL_NAME_OR_PATH \ 62 | --language ar,bg,de,el,en,es,fr,hi,ru,sw,th,tr,ur,vi,zh \ 63 | --train_language en \ 64 | --do_train \ 65 | --eval_splits valid \ 66 | --fp16 \ 67 | --per_gpu_train_batch_size 8 \ 68 | --learning_rate 3e-6 \ 69 | --num_train_epochs 5 \ 70 | --max_seq_length 256 \ 71 | --output_dir $OUTPUT_DIR \ 72 | --log_dir $OUTPUT_DIR \ 73 | --overwrite_output_dir \ 74 | --logging_steps 500 \ 75 | --logging_each_epoch \ 76 | --per_gpu_eval_batch_size 64 \ 77 | --eval_all_checkpoints \ 78 | --filter_m 1 --filter_k 1 \ 79 | ${OTHER_ARGS} 80 | } 81 | 82 | pawsx() { 83 | python -m torch.distributed.launch --nproc_per_node=$N_GPU --master_port=$RANDOM ./examples/run_xcls.py \ 84 | --task_name pawsx \ 85 | --data_dir $DATA_DIR/pawsx \ 86 | --model_type filter \ 87 | --language de,en,es,fr,ja,ko,zh \ 88 | --model_name_or_path $MODEL_NAME_OR_PATH \ 89 | --train_language en \ 90 | --do_train \ 91 | --eval_splits valid \ 92 | --per_gpu_train_batch_size 4 \ 93 | --learning_rate 1e-5 \ 94 | --num_train_epochs 4 \ 95 | --max_seq_length 256 \ 96 | --output_dir $OUTPUT_DIR \ 97 | --log_dir $OUTPUT_DIR \ 98 | --overwrite_output_dir \ 99 | --logging_steps 500 \ 100 | --per_gpu_eval_batch_size 64 \ 101 | --logging_each_epoch \ 102 | --filter_m 1 --filter_k 1 \ 103 | ${OTHER_ARGS} 104 | } 105 | 106 | mlqa() { 107 | # mlqa and xquad share the same training set 108 | python -m torch.distributed.launch --nproc_per_node=$N_GPU --master_port=$RANDOM ./examples/run_xqa.py \ 109 | --task_name mlqa \ 110 | --data_dir $DATA_DIR \ 111 | --model_type filter \ 112 | --model_name_or_path $MODEL_NAME_OR_PATH \ 113 | --language ar,de,en,es,hi,vi,zh \ 114 | --train_language en \ 115 | --do_train \ 116 | --eval_splits 'dev' \ 117 | --do_lower_case \ 118 | --per_gpu_train_batch_size 4 \ 119 | --gradient_accumulation_steps 2 \ 120 | --learning_rate 5e-6 \ 121 | --per_gpu_eval_batch_size 64 \ 122 | --num_train_epochs 2.0 \ 123 | --max_seq_length 384 \ 124 | --doc_stride 128 \ 125 | --output_dir $OUTPUT_DIR \ 126 | --log_dir $OUTPUT_DIR \ 127 | --logging_each_epoch \ 128 | --evaluate_during_training \ 129 | --threads 8 \ 130 | --filter_m 1 --filter_k 20 \ 131 | ${OTHER_ARGS} 132 | } 133 | 134 | 135 | xquad() { 136 | python -m torch.distributed.launch --nproc_per_node=$N_GPU --master_port=$RANDOM ./examples/run_xqa.py \ 137 | --task_name xquad \ 138 | --data_dir $DATA_DIR/ \ 139 | --model_type filter \ 140 | --model_name_or_path $MODEL_NAME_OR_PATH \ 141 | --language ar,de,el,en,es,hi,ru,th,tr,vi,zh \ 142 | --train_language en \ 143 | --do_train \ 144 | --eval_splits 'dev' \ 145 | --do_lower_case \ 146 | --per_gpu_train_batch_size 4 \ 147 | --learning_rate 5e-6 \ 148 | --per_gpu_eval_batch_size 64 \ 149 | --num_train_epochs 2.0 \ 150 | --max_seq_length 384 \ 151 | --doc_stride 128 \ 152 | --output_dir $OUTPUT_DIR \ 153 | --log_dir $OUTPUT_DIR \ 154 | --logging_each_epoch \ 155 | --eval_all_checkpoints \ 156 | --threads 8 \ 157 | --filter_m 1 --filter_k 20 \ 158 | ${OTHER_ARGS} 159 | } 160 | 161 | tydiqa() { 162 | python -m torch.distributed.launch --nproc_per_node=$N_GPU --master_port=$RANDOM ./examples/run_xqa.py \ 163 | --task_name tydiqa \ 164 | --data_dir $DATA_DIR \ 165 | --model_type filter \ 166 | --model_name_or_path $MODEL_NAME_OR_PATH \ 167 | --language ar,bn,en,fi,id,ko,ru,sw,te \ 168 | --train_language en \ 169 | --do_train \ 170 | --do_lower_case \ 171 | --eval_splits dev \ 172 | --per_gpu_train_batch_size 4 \ 173 | --learning_rate 1e-5 \ 174 | --per_gpu_eval_batch_size 64 \ 175 | --num_train_epochs 4.0 \ 176 | --logging_each_epoch \ 177 | --max_seq_length 384 \ 178 | --doc_stride 128 \ 179 | --output_dir $OUTPUT_DIR \ 180 | --log_dir $OUTPUT_DIR \ 181 | --overwrite_output_dir \ 182 | --eval_all_checkpoints \ 183 | --threads 8 \ 184 | --filter_m 1 --filter_k 20 \ 185 | ${OTHER_ARGS} 186 | } 187 | 188 | udpos() { 189 | python -m torch.distributed.launch --nproc_per_node=$N_GPU --master_port=$RANDOM ./examples/run_xtreme_tag.py \ 190 | --task_name udpos \ 191 | --data_dir $DATA_ROOT/udpos/udpos_processed_maxlen128 \ 192 | --model_type filter \ 193 | --model_name_or_path $MODEL_NAME_OR_PATH \ 194 | --labels $DATA_ROOT/udpos/udpos_processed_maxlen128/labels.txt \ 195 | --language af,ar,bg,de,el,en,es,et,eu,fa,fi,fr,he,hi,hu,id,it,ja,kk,ko,mr,nl,pt,ru,ta,te,th,tl,tr,ur,vi,yo,zh \ 196 | --train_language en \ 197 | --do_train \ 198 | --eval_splits dev \ 199 | --max_seq_length 128 \ 200 | --num_train_epochs 20 \ 201 | --per_gpu_train_batch_size 8 \ 202 | --per_gpu_eval_batch_size 64 \ 203 | --learning_rate 5e-6 \ 204 | --save_steps 1000 \ 205 | --output_dir $OUTPUT_DIR \ 206 | --log_dir $OUTPUT_DIR \ 207 | --eval_all_checkpoints \ 208 | --filter_m 1 --filter_k 1 \ 209 | ${OTHER_ARGS} 210 | } 211 | 212 | panx() { 213 | python -m torch.distributed.launch --nproc_per_node=${N_GPU:-8} --master_port=$RANDOM ./examples/run_tag.py \ 214 | --task_name panx \ 215 | --data_dir $DATA_ROOT/panx/panx_processed_maxlen128 \ 216 | --labels $DATA_ROOT/panx/panx_processed_maxlen128/labels.txt \ 217 | --model_type filter \ 218 | --model_name_or_path $MODEL_NAME_OR_PATH \ 219 | --language ar,he,vi,id,jv,ms,tl,eu,ml,ta,te,af,nl,en,de,el,bn,hi,mr,ur,fa,fr,it,pt,es,bg,ru,ja,ka,ko,th,sw,yo,my,zh,kk,tr,et,fi,hu \ 220 | --train_language en \ 221 | --do_train \ 222 | --eval_splits dev \ 223 | --max_seq_length 128 \ 224 | --num_train_epochs 20 \ 225 | --per_gpu_train_batch_size 8 \ 226 | --per_gpu_eval_batch_size 64 \ 227 | --learning_rate 5e-6 \ 228 | --save_steps 1000 \ 229 | --eval_all_checkpoints \ 230 | --log_dir $OUTPUT_DIR \ 231 | --output_dir $OUTPUT_DIR \ 232 | --filter_m 1 --filter_k 1 \ 233 | ${OTHER_ARGS} 234 | } 235 | 236 | for task in xnli pawsx mlqa xquad tydiqa udpos panx 237 | do 238 | if [[ ${TASK:-"xnli"} == $task ]]; then 239 | $task 240 | fi 241 | done 242 | --------------------------------------------------------------------------------