├── transformers
│   ├── file.py
│   ├── benchmark
│   │   ├── __init__.py
│   │   ├── benchmark_args.py
│   │   ├── benchmark_args_tf.py
│   │   └── benchmark_args_utils.py
│   ├── .modeling_bert.py.swp
│   ├── .vscode
│   │   └── settings.json
│   ├── commands
│   │   ├── __init__.py
│   │   ├── transformers_cli.py
│   │   ├── download.py
│   │   ├── env.py
│   │   ├── run.py
│   │   └── train.py
│   ├── data
│   │   ├── datasets
│   │   │   ├── __init__.py
│   │   │   ├── dcn_language_modeling.py
│   │   │   ├── language_modeling.py
│   │   │   └── glue.py
│   │   ├── processors
│   │   │   ├── __init__.py
│   │   │   └── xnli.py
│   │   ├── __init__.py
│   │   └── metrics
│   │       └── __init__.py
│   ├── another_try.py
│   ├── try.py
│   ├── configuration_marian.py
│   ├── convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py
│   ├── filep.py
│   ├── trainer_utils.py
│   ├── configuration_camembert.py
│   ├── configuration_mmbt.py
│   ├── activations.py
│   ├── convert_mobilebert_original_tf_checkpoint_to_pytorch.py
│   ├── utils_encoder_decoder.py
│   ├── configuration_xlm_roberta.py
│   ├── convert_t5_original_tf_checkpoint_to_pytorch.py
│   ├── convert_bert_original_tf_checkpoint_to_pytorch.py
│   ├── convert_albert_original_tf_checkpoint_to_pytorch.py
│   ├── modeling_marian.py
│   ├── tokenization_longformer.py
│   ├── training_args_tf.py
│   ├── tokenization_mobilebert.py
│   ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py
│   ├── tokenization_retribert.py
│   ├── convert_openai_original_tf_checkpoint_to_pytorch.py
│   ├── convert_electra_original_tf_checkpoint_to_pytorch.py
│   ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py
│   ├── convert_longformer_original_pytorch_lightning_to_pytorch.py
│   ├── configuration_longformer.py
│   ├── configuration_roberta.py
│   ├── tokenization_electra.py
│   ├── tokenization_distilbert.py
│   ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py
│   ├── convert_bert_pytorch_checkpoint_to_original_tf.py
│   ├── configuration_encoder_decoder.py
│   ├── configuration_t5.py
│   ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
│   ├── modeling_camembert.py
│   ├── configuration_retribert.py
│   ├── modeling_xlm_roberta.py
│   ├── configuration_ctrl.py
│   ├── tokenization_flaubert.py
│   ├── configuration_bart.py
│   ├── modeling_tf_camembert.py
│   ├── modeling_tf_xlm_roberta.py
│   ├── convert_bart_original_pytorch_checkpoint_to_pytorch.py
│   └── configuration_distilbert.py
├── requirements.txt
├── predict.sh
├── config.json
├── chinese_roberta_wwm_ext_pytorch
│   └── config.json
├── train.sh
├── README.md
├── .gitignore
└── vocab
    └── pinyin_vocab.txt
/transformers/file.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/transformers/benchmark/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==1.5.0
2 | transformers==3.0.0
3 | 
--------------------------------------------------------------------------------
/transformers/.modeling_bert.py.swp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/destwang/DCN/HEAD/transformers/.modeling_bert.py.swp
--------------------------------------------------------------------------------
/transformers/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |     "python.formatting.provider": "yapf",
3 |     "C_Cpp.dimInactiveRegions": false
4 | }
--------------------------------------------------------------------------------
/predict.sh:
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -v 4 | set -e 5 | 6 | 7 | INPUT_FILE=data/sighan15/TestInput.txt 8 | OUTPUT_FILE=output.txt 9 | MODEL_DIR=dcn_models/ 10 | MAX_LENGTH=130 11 | BATCH_SIZE=4 12 | 13 | python predict_DCN.py \ 14 | --model $MODEL_DIR \ 15 | --input_file $INPUT_FILE \ 16 | --output_file $OUTPUT_FILE \ 17 | --batch_size $BATCH_SIZE \ 18 | --max_len $MAX_LENGTH 19 | -------------------------------------------------------------------------------- /transformers/commands/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from argparse import ArgumentParser 3 | 4 | 5 | class BaseTransformersCLICommand(ABC): 6 | @staticmethod 7 | @abstractmethod 8 | def register_subcommand(parser: ArgumentParser): 9 | raise NotImplementedError() 10 | 11 | @abstractmethod 12 | def run(self): 13 | raise NotImplementedError() 14 | -------------------------------------------------------------------------------- /transformers/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | from .glue import GlueDataset, GlueDataTrainingArguments 6 | from .language_modeling import LineByLineTextDataset, TextDataset 7 | from .dcn_language_modeling import PinyinShuffleLineByLineTextDataset 8 | -------------------------------------------------------------------------------- /transformers/another_try.py: -------------------------------------------------------------------------------- 1 | from transformers import TFBertModel, BertTokenizer, BertConfig 2 | import tensorflow as tf 3 | 4 | config = BertConfig.from_pretrained("bert-base-cased", output_hidden_states=True) 5 | model = TFBertModel.from_pretrained("bert-base-cased", config=config) 6 | 7 | tok = BertTokenizer.from_pretrained("bert-base-cased") 8 | text = tok.encode("Ain't this [MASK] best thing you've ever seen?") 9 | 10 | inputs = tf.constant(text) 11 | outputs = model.predict(inputs) 12 | 13 | print(outputs) -------------------------------------------------------------------------------- /transformers/data/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 
4 | 5 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels 6 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features 7 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor 8 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels 9 | -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "topk": 8, 3 | "pinyin_vocab_size": 404, 4 | "pinyin_mapping_path": "vocab/pinyin_mapping.txt", 5 | "attention_probs_dropout_prob": 0.1, 6 | "directionality": "bidi", 7 | "hidden_act": "gelu", 8 | "hidden_dropout_prob": 0.1, 9 | "hidden_size": 768, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 3072, 12 | "max_position_embeddings": 512, 13 | "num_attention_heads": 12, 14 | "num_hidden_layers": 12, 15 | "pooler_fc_size": 768, 16 | "pooler_num_attention_heads": 12, 17 | "pooler_num_fc_layers": 3, 18 | "pooler_size_per_head": 128, 19 | "pooler_type": "first_token_transform", 20 | "type_vocab_size": 2, 21 | "vocab_size": 21128 22 | } 23 | -------------------------------------------------------------------------------- /chinese_roberta_wwm_ext_pytorch/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "topk": 8, 3 | "pinyin_vocab_size": 404, 4 | "pinyin_mapping_path": "vocab/pinyin_mapping.txt", 5 | "attention_probs_dropout_prob": 0.1, 6 | "directionality": "bidi", 7 | "hidden_act": "gelu", 8 | "hidden_dropout_prob": 0.1, 9 | "hidden_size": 768, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 3072, 12 | "max_position_embeddings": 512, 13 | "num_attention_heads": 12, 14 | "num_hidden_layers": 12, 15 | "pooler_fc_size": 768, 16 | "pooler_num_attention_heads": 12, 17 | "pooler_num_fc_layers": 3, 18 | "pooler_size_per_head": 128, 19 | "pooler_type": "first_token_transform", 20 | "type_vocab_size": 2, 21 | "vocab_size": 21128 22 | } 23 | -------------------------------------------------------------------------------- /transformers/try.py: -------------------------------------------------------------------------------- 1 | from transformers import TFAlbertForMaskedLM, TFAlbertModel, TFAlbertForSequenceClassification, AlbertForMaskedLM 2 | import os 3 | 4 | checkpoint = "albert-base-v1" 5 | 6 | model = AlbertForMaskedLM.from_pretrained(checkpoint) 7 | 8 | if not os.path.exists("~/saved/" + checkpoint): 9 | os.makedirs("~/saved/" + checkpoint) 10 | 11 | 12 | model.save_pretrained("~/saved/" + checkpoint) 13 | model = TFAlbertForMaskedLM.from_pretrained('~/saved/' + checkpoint, from_pt=True) 14 | model.save_pretrained("~/saved/" + checkpoint) 15 | model = TFAlbertModel.from_pretrained('~/saved/' + checkpoint) 16 | model = TFAlbertForMaskedLM.from_pretrained('~/saved/' + checkpoint) 17 | model = TFAlbertForSequenceClassification.from_pretrained('~/saved/' + checkpoint) 18 | 19 | 20 | print("nice model") -------------------------------------------------------------------------------- /transformers/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 
4 | 5 | from .metrics import is_sklearn_available 6 | from .processors import ( 7 | DataProcessor, 8 | InputExample, 9 | InputFeatures, 10 | SingleSentenceClassificationProcessor, 11 | SquadExample, 12 | SquadFeatures, 13 | SquadV1Processor, 14 | SquadV2Processor, 15 | glue_convert_examples_to_features, 16 | glue_output_modes, 17 | glue_processors, 18 | glue_tasks_num_labels, 19 | squad_convert_examples_to_features, 20 | xnli_output_modes, 21 | xnli_processors, 22 | xnli_tasks_num_labels, 23 | ) 24 | 25 | 26 | if is_sklearn_available(): 27 | from .metrics import glue_compute_metrics, xnli_compute_metrics 28 | -------------------------------------------------------------------------------- /train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -v 4 | set -e 5 | 6 | 7 | TRAIN_FILE=data/train.txt 8 | TEST_FILE=data/sighan15/test_format.txt 9 | BERT_MODEL=chinese_roberta_wwm_ext_pytorch/ 10 | OUTPUT_DIR=dcn_models/ 11 | SAVE_STEPS=8794 12 | SEED=1038 13 | LR=5e-5 14 | SAVE_TOTAL_LIMIT=5 15 | MAX_LENGTH=130 16 | BATCH_SIZE=32 17 | NUM_EPOCHS=10 18 | 19 | python train_DCN.py \ 20 | --output_dir $OUTPUT_DIR \ 21 | --learning_rate $LR \ 22 | --per_gpu_train_batch_size $BATCH_SIZE \ 23 | --model_type=bert \ 24 | --model_name_or_path=$BERT_MODEL \ 25 | --num_train_epochs $NUM_EPOCHS \ 26 | --save_steps $SAVE_STEPS \ 27 | --logging_steps $SAVE_STEPS \ 28 | --save_total_limit $SAVE_TOTAL_LIMIT \ 29 | --block_size $MAX_LENGTH \ 30 | --train_data_file=$TRAIN_FILE \ 31 | --eval_data_file=$TEST_FILE \ 32 | --do_train \ 33 | --do_eval \ 34 | --do_predict \ 35 | --evaluate_during_training \ 36 | --seed $SEED \ 37 | --mlm \ 38 | --mlm_probability 0.15 39 | -------------------------------------------------------------------------------- /transformers/configuration_marian.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The OPUS-NMT Team, Marian team, and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """ Marian model configuration """
16 | 
17 | from .configuration_bart import BartConfig
18 | 
19 | 
20 | PRETRAINED_CONFIG_ARCHIVE_MAP = {
21 |     "Helsinki-NLP/opus-mt-en-de": "https://s3.amazonaws.com/models.huggingface.co/bert/Helsinki-NLP/opus-mt-en-de/config.json",
22 | }
23 | 
24 | 
25 | class MarianConfig(BartConfig):
26 |     model_type = "marian"
27 | 
--------------------------------------------------------------------------------
/transformers/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | 
4 | import torch
5 | 
6 | from transformers.file_utils import WEIGHTS_NAME
7 | 
8 | 
9 | DIALOGPT_MODELS = ["small", "medium", "large"]
10 | 
11 | OLD_KEY = "lm_head.decoder.weight"
12 | NEW_KEY = "lm_head.weight"
13 | 
14 | 
15 | def convert_dialogpt_checkpoint(checkpoint_path: str, pytorch_dump_folder_path: str):
16 |     d = torch.load(checkpoint_path)
17 |     d[NEW_KEY] = d.pop(OLD_KEY)
18 |     os.makedirs(pytorch_dump_folder_path, exist_ok=True)
19 |     torch.save(d, os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME))
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     parser = argparse.ArgumentParser()
24 |     parser.add_argument("--dialogpt_path", default=".", type=str)
25 |     args = parser.parse_args()
26 |     for MODEL in DIALOGPT_MODELS:
27 |         checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl")
28 |         pytorch_dump_folder_path = f"./DialoGPT-{MODEL}"
29 |         convert_dialogpt_checkpoint(
30 |             checkpoint_path, pytorch_dump_folder_path,
31 |         )
32 | 
--------------------------------------------------------------------------------
/transformers/filep.py:
--------------------------------------------------------------------------------
1 | from transformers import GPT2LMHeadModel, GPT2Tokenizer
2 | import torch
3 | 
4 | tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
5 | model = GPT2LMHeadModel.from_pretrained('gpt2')
6 | 
7 | generated = tokenizer.encode("The Manhattan bridge")
8 | context = torch.tensor([generated])
9 | past = None
10 | 
11 | for i in range(15):
12 |     output, past = model(context, past=past)
13 | 
14 |     distribution = output[0, :]
15 | 
16 |     # Get the top 10 values' indices and cast them to a list
17 |     top_values = distribution[-1].topk(10).indices.tolist()
18 | 
19 |     # Decode those into words
20 |     top_words = [tokenizer.decode([x]) for x in top_values]
21 | 
22 |     # select words (only arbitrarily select the first three)
23 |     words = top_words[0:3]
24 | 
25 |     # Cast them back to tokens which can be used as an added token
26 |     selected_tokens = [tokenizer.encode(word) for word in words]
27 | 
28 |     # Greedily keep the most likely token and feed it back as the next context
29 |     argmax_token = distribution[-1].argmax()
30 |     generated += [argmax_token.tolist()]
31 |     context = argmax_token.unsqueeze(0)
32 | 
33 |     print(tokenizer.decode([argmax_token.tolist()]))
34 | 
35 | sequence = tokenizer.decode(generated)
36 | 
37 | print(sequence)
--------------------------------------------------------------------------------
/transformers/trainer_utils.py:
--------------------------------------------------------------------------------
1 | import os 2 | from typing import Dict, NamedTuple, Optional 3 | 4 | import numpy as np 5 | 6 | 7 | try: 8 | import wandb 9 | 10 | wandb.ensure_configured() 11 | if wandb.api.api_key is None: 12 | _has_wandb = False 13 | wandb.termwarn("W&B installed but not logged in.
Run `wandb login` or set the WANDB_API_KEY env variable.") 14 | else: 15 | _has_wandb = False if os.getenv("WANDB_DISABLED") else True 16 | except (ImportError, AttributeError): 17 | _has_wandb = False 18 | 19 | 20 | def is_wandb_available(): 21 | return _has_wandb 22 | 23 | 24 | class EvalPrediction(NamedTuple): 25 | """ 26 | Evaluation output (always contains labels), to be used 27 | to compute metrics. 28 | """ 29 | 30 | #predictions: Union[list, np.ndarray] 31 | #label_ids: Union[list, np.ndarray] 32 | predictions: np.ndarray 33 | label_ids: np.ndarray 34 | pass 35 | 36 | 37 | class PredictionOutput(NamedTuple): 38 | predictions: np.ndarray 39 | label_ids: Optional[np.ndarray] 40 | metrics: Optional[Dict[str, float]] 41 | 42 | 43 | class TrainOutput(NamedTuple): 44 | global_step: int 45 | training_loss: float 46 | 47 | 48 | PREFIX_CHECKPOINT_DIR = "checkpoint" 49 | -------------------------------------------------------------------------------- /transformers/commands/transformers_cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from argparse import ArgumentParser 3 | 4 | from transformers.commands.convert import ConvertCommand 5 | from transformers.commands.download import DownloadCommand 6 | from transformers.commands.env import EnvironmentCommand 7 | from transformers.commands.run import RunCommand 8 | from transformers.commands.serving import ServeCommand 9 | from transformers.commands.user import UserCommands 10 | 11 | 12 | def main(): 13 | parser = ArgumentParser("Transformers CLI tool", usage="transformers-cli []") 14 | commands_parser = parser.add_subparsers(help="transformers-cli command helpers") 15 | 16 | # Register commands 17 | ConvertCommand.register_subcommand(commands_parser) 18 | DownloadCommand.register_subcommand(commands_parser) 19 | EnvironmentCommand.register_subcommand(commands_parser) 20 | RunCommand.register_subcommand(commands_parser) 21 | ServeCommand.register_subcommand(commands_parser) 22 | UserCommands.register_subcommand(commands_parser) 23 | 24 | # Let's go 25 | args = parser.parse_args() 26 | 27 | if not hasattr(args, "func"): 28 | parser.print_help() 29 | exit(1) 30 | 31 | # Run 32 | service = args.func(args) 33 | service.run() 34 | 35 | 36 | if __name__ == "__main__": 37 | main() 38 | -------------------------------------------------------------------------------- /transformers/commands/download.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from transformers.commands import BaseTransformersCLICommand 4 | 5 | 6 | def download_command_factory(args): 7 | return DownloadCommand(args.model, args.cache_dir, args.force) 8 | 9 | 10 | class DownloadCommand(BaseTransformersCLICommand): 11 | @staticmethod 12 | def register_subcommand(parser: ArgumentParser): 13 | download_parser = parser.add_parser("download") 14 | download_parser.add_argument( 15 | "--cache-dir", type=str, default=None, help="Path to location to store the models" 16 | ) 17 | download_parser.add_argument( 18 | "--force", action="store_true", help="Force the model to be download even if already in cache-dir" 19 | ) 20 | download_parser.add_argument("model", type=str, help="Name of the model to download") 21 | download_parser.set_defaults(func=download_command_factory) 22 | 23 | def __init__(self, model: str, cache: str, force: bool): 24 | self._model = model 25 | self._cache = cache 26 | self._force = force 27 | 28 | def run(self): 29 | from 
transformers import AutoModel, AutoTokenizer 30 | 31 | AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 32 | AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 33 | -------------------------------------------------------------------------------- /transformers/configuration_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ CamemBERT configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_roberta import RobertaConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json", 28 | "umberto-commoncrawl-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-commoncrawl-cased-v1/config.json", 29 | "umberto-wikipedia-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-wikipedia-uncased-v1/config.json", 30 | } 31 | 32 | 33 | class CamembertConfig(RobertaConfig): 34 | """ 35 | This class overrides :class:`~transformers.RobertaConfig`. Please check the 36 | superclass for the appropriate documentation alongside usage examples. 37 | """ 38 | 39 | model_type = "camembert" 40 | -------------------------------------------------------------------------------- /transformers/configuration_mmbt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # Copyright (c) HuggingFace Inc. team. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ MMBT configuration """ 17 | 18 | 19 | import logging 20 | 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | class MMBTConfig(object): 26 | """Configuration class to store the configuration of a `MMBT Model`. 27 | 28 | Args: 29 | config (:obj:`~transformers.PreTrainedConfig`): 30 | Config of the underlying Transformer models. Its values are 31 | copied over to use a single config. 
32 | num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`): 33 | Size of final Linear layer for classification. 34 | modal_hidden_size (:obj:`int`, optional, defautls to 2048): 35 | Embedding dimension of the non-text modality encoder. 36 | """ 37 | 38 | def __init__(self, config, num_labels=None, modal_hidden_size=2048): 39 | self.__dict__ = config.__dict__ 40 | self.modal_hidden_size = modal_hidden_size 41 | if num_labels: 42 | self.num_labels = num_labels 43 | -------------------------------------------------------------------------------- /transformers/activations.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | def swish(x): 12 | return x * torch.sigmoid(x) 13 | 14 | 15 | def _gelu_python(x): 16 | """ Original Implementation of the gelu activation function in Google Bert repo when initially created. 17 | For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 18 | 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 19 | This is now written in C in torch.nn.functional 20 | Also see https://arxiv.org/abs/1606.08415 21 | """ 22 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 23 | 24 | 25 | def gelu_new(x): 26 | """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). 27 | Also see https://arxiv.org/abs/1606.08415 28 | """ 29 | return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0)))) 30 | 31 | 32 | if torch.__version__ < "1.4.0": 33 | gelu = _gelu_python 34 | else: 35 | gelu = F.gelu 36 | 37 | 38 | def gelu_fast(x): 39 | return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) 40 | 41 | 42 | ACT2FN = { 43 | "relu": F.relu, 44 | "swish": swish, 45 | "gelu": gelu, 46 | "tanh": torch.tanh, 47 | "gelu_new": gelu_new, 48 | "gelu_fast": gelu_fast, 49 | } 50 | 51 | 52 | def get_activation(activation_string): 53 | if activation_string in ACT2FN: 54 | return ACT2FN[activation_string] 55 | else: 56 | raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys()))) 57 | -------------------------------------------------------------------------------- /transformers/convert_mobilebert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | 4 | import torch 5 | 6 | from transformers import MobileBertConfig, MobileBertForPreTraining, load_tf_weights_in_mobilebert 7 | 8 | 9 | logging.basicConfig(level=logging.INFO) 10 | 11 | 12 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, mobilebert_config_file, pytorch_dump_path): 13 | # Initialise PyTorch model 14 | config = MobileBertConfig.from_json_file(mobilebert_config_file) 15 | print("Building PyTorch model from configuration: {}".format(str(config))) 16 | model = MobileBertForPreTraining(config) 17 | # Load weights from tf checkpoint 18 | model = load_tf_weights_in_mobilebert(model, config, tf_checkpoint_path) 19 | # Save pytorch-model 20 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 21 | torch.save(model.state_dict(), pytorch_dump_path) 22 | 23 | 24 | if __name__ == "__main__": 25 | parser = argparse.ArgumentParser() 26 | # Required parameters 27 | parser.add_argument( 28 | 
"--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 29 | ) 30 | parser.add_argument( 31 | "--mobilebert_config_file", 32 | default=None, 33 | type=str, 34 | required=True, 35 | help="The config json file corresponding to the pre-trained MobileBERT model. \n" 36 | "This specifies the model architecture.", 37 | ) 38 | parser.add_argument( 39 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 40 | ) 41 | args = parser.parse_args() 42 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.mobilebert_config_file, args.pytorch_dump_path) 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dynamic Connected Networks for Chinese Spelling Check 2 | 3 | This repository provides training code of DCN models for Chinese Spelling Check (CSC). 4 | 5 | The paper has been accepted in Findings of ACL 2021. 6 | 7 | 8 | ## Installation 9 | Our code is based on [transformers 3.0](https://github.com/huggingface/transformers/tree/v3.0.0). 10 | 11 | The following command installs all necessary packages: 12 | ``` 13 | pip install -r requirements.txt 14 | ``` 15 | We test our code using Python 3.6. 16 | 17 | 18 | ## Datasets 19 | The preprocessed training dataset can be downloaded from [here(password:hfiw)](https://pan.baidu.com/s/161ae-g2A7M0KnpJI79hLWg). 20 | 21 | 22 | ## Train Model 23 | To train the DCN model, download the [RoBERTa-wwm-ext](https://github.com/ymcui/Chinese-BERT-wwm) and copy the model to *chinese_roberta_wwm_ext_pytorch*, then run: 24 | ``` 25 | sh train.sh 26 | ``` 27 | 28 | ## Experimental Result 29 | The sentence-level experimental results on SIGHAN15 for the default config are as follows: 30 | 31 | | model | d-p | d-r | d-f | c-p | c-r | c-f | 32 | | - | - | - | - | - | - | - | 33 | | DCN | 76.84 | 79.64 | 78.21 | 74.74 | 77.45 | 76.07 | 34 | 35 | 36 | ## Citation 37 | ``` 38 | @inproceedings{wang-etal-2021-dynamic, 39 | title = "Dynamic Connected Networks for {C}hinese Spelling Check", 40 | author = "Wang, Baoxin and 41 | Che, Wanxiang and 42 | Wu, Dayong and 43 | Wang, Shijin and 44 | Hu, Guoping and 45 | Liu, Ting", 46 | booktitle = "Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021", 47 | month = aug, 48 | year = "2021", 49 | address = "Online", 50 | publisher = "Association for Computational Linguistics", 51 | url = "https://aclanthology.org/2021.findings-acl.216", 52 | doi = "10.18653/v1/2021.findings-acl.216", 53 | pages = "2437--2446", 54 | } 55 | ``` 56 | 57 | ## Related Work 58 | * [CTC 2021](https://github.com/destwang/CTC2021) 59 | * [CTC Resources](https://github.com/destwang/CTCResources) 60 | -------------------------------------------------------------------------------- /transformers/utils_encoder_decoder.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Classes to support Encoder-Decoder architectures """ 16 | 17 | 18 | def prepare_encoder_decoder_model_kwargs(**kwargs): 19 | """ Prepare the encoder and decoder's keyword arguments. 20 | 21 | Keyword arguments come in 3 flavors: 22 | - encoder-specific (prefixed by `encoder_`) 23 | - decoder-specific (prefixed by `decoder_`) 24 | - those that apply to the model as whole. 25 | 26 | We let the specific kwargs override the common ones in case of 27 | conflict. 28 | """ 29 | 30 | kwargs_common = { 31 | argument: value 32 | for argument, value in kwargs.items() 33 | if not argument.startswith("encoder_") and not argument.startswith("decoder_") 34 | } 35 | if "input_ids" in kwargs_common: 36 | kwargs["encoder_input_ids"] = kwargs_common.pop("input_ids") 37 | 38 | decoder_kwargs = kwargs_common.copy() 39 | encoder_kwargs = kwargs_common.copy() 40 | encoder_kwargs.update( 41 | {argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")} 42 | ) 43 | decoder_kwargs.update( 44 | {argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")} 45 | ) 46 | decoder_kwargs["encoder_attention_mask"] = encoder_kwargs.get("attention_mask", None) 47 | return encoder_kwargs, decoder_kwargs 48 | -------------------------------------------------------------------------------- /transformers/configuration_xlm_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ XLM-RoBERTa configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_roberta import RobertaConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json", 28 | "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json", 29 | "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json", 30 | "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json", 31 | "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json", 32 | "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json", 33 | } 34 | 35 | 36 | class XLMRobertaConfig(RobertaConfig): 37 | """ 38 | This class overrides :class:`~transformers.RobertaConfig`. Please check the 39 | superclass for the appropriate documentation alongside usage examples. 40 | """ 41 | 42 | model_type = "xlm-roberta" 43 | -------------------------------------------------------------------------------- /transformers/commands/env.py: -------------------------------------------------------------------------------- 1 | import platform 2 | from argparse import ArgumentParser 3 | 4 | from transformers import __version__ as version 5 | from transformers import is_tf_available, is_torch_available 6 | from transformers.commands import BaseTransformersCLICommand 7 | 8 | 9 | def info_command_factory(_): 10 | return EnvironmentCommand() 11 | 12 | 13 | class EnvironmentCommand(BaseTransformersCLICommand): 14 | @staticmethod 15 | def register_subcommand(parser: ArgumentParser): 16 | download_parser = parser.add_parser("env") 17 | download_parser.set_defaults(func=info_command_factory) 18 | 19 | def run(self): 20 | pt_version = "not installed" 21 | pt_cuda_available = "NA" 22 | if is_torch_available(): 23 | import torch 24 | 25 | pt_version = torch.__version__ 26 | pt_cuda_available = torch.cuda.is_available() 27 | 28 | tf_version = "not installed" 29 | tf_cuda_available = "NA" 30 | if is_tf_available(): 31 | import tensorflow as tf 32 | 33 | tf_version = tf.__version__ 34 | try: 35 | # deprecated in v2.1 36 | tf_cuda_available = tf.test.is_gpu_available() 37 | except AttributeError: 38 | # returns list of devices, convert to bool 39 | tf_cuda_available = bool(tf.config.list_physical_devices("GPU")) 40 | 41 | info = { 42 | "`transformers` version": version, 43 | "Platform": platform.platform(), 44 | "Python version": platform.python_version(), 45 | "PyTorch version (GPU?)": "{} ({})".format(pt_version, pt_cuda_available), 46 | "Tensorflow version (GPU?)": "{} ({})".format(tf_version, tf_cuda_available), 47 | "Using GPU in script?": "", 48 | "Using distributed or parallel set-up in script?": "", 49 | } 50 | 51 | print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n") 52 | print(self.format_dict(info)) 53 | 54 | return info 55 | 56 | @staticmethod 57 | def format_dict(d): 58 | return "\n".join(["- {}: {}".format(prop, val) for prop, val in d.items()]) + "\n" 59 | 
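The `env` command above is wired into the CLI through the same `register_subcommand` / `set_defaults(func=...)` pattern used by every command under `transformers/commands/` (see `transformers_cli.py` earlier in this dump). A minimal sketch of driving it directly, for illustration only (not part of the repository, and assuming transformers 3.0 from requirements.txt is installed):

from argparse import ArgumentParser
from transformers.commands.env import EnvironmentCommand

# Register only the "env" subcommand, mirroring what transformers_cli.py does for all commands.
parser = ArgumentParser("transformers-cli")
subcommands = parser.add_subparsers()
EnvironmentCommand.register_subcommand(subcommands)

# Parsing ["env"] selects the factory stored via set_defaults(func=...);
# calling it builds an EnvironmentCommand, and run() prints the environment report.
args = parser.parse_args(["env"])
args.func(args).run()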
-------------------------------------------------------------------------------- /transformers/convert_t5_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The T5 authors and HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert T5 checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import T5Config, T5Model, load_tf_weights_in_t5 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = T5Config.from_json_file(config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = T5Model(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_t5(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained T5 model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /transformers/convert_bert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert BERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = BertConfig.from_json_file(bert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = BertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_bert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--bert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained BERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /transformers/convert_albert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert ALBERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import AlbertConfig, AlbertForPreTraining, load_tf_weights_in_albert 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = AlbertConfig.from_json_file(albert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = AlbertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_albert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--albert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained ALBERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /transformers/modeling_marian.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 Marian Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """PyTorch MarianMTModel model, ported from the Marian C++ repo.""" 16 | 17 | 18 | from transformers.modeling_bart import BartForConditionalGeneration 19 | 20 | 21 | MARIAN_PRETRAINED_MODEL_ARCHIVE_LIST = [ 22 | # See all Marian models at https://huggingface.co/models?search=Helsinki-NLP 23 | ] 24 | 25 | 26 | class MarianMTModel(BartForConditionalGeneration): 27 | r""" 28 | Pytorch version of marian-nmt's transformer.h (c++). Designed for the OPUS-NMT translation checkpoints. 29 | Model API is identical to BartForConditionalGeneration. 30 | Available models are listed at `Model List `__ 31 | 32 | Examples:: 33 | 34 | >>> from transformers import MarianTokenizer, MarianMTModel 35 | >>> from typing import List 36 | >>> src = 'fr' # source language 37 | >>> trg = 'en' # target language 38 | >>> sample_text = "où est l'arrêt de bus ?" 
39 | >>> mname = f'Helsinki-NLP/opus-mt-{src}-{trg}' 40 | 41 | >>> model = MarianMTModel.from_pretrained(mname) 42 | >>> tok = MarianTokenizer.from_pretrained(mname) 43 | >>> batch = tok.prepare_translation_batch(src_texts=[sample_text]) # don't need tgt_text for inference 44 | >>> gen = model.generate(**batch) # for forward pass: model(**batch) 45 | >>> words: List[str] = tok.batch_decode(gen, skip_special_tokens=True) # returns "Where is the the bus stop ?" 46 | 47 | """ 48 | 49 | def adjust_logits_during_generation(self, logits, cur_len, max_length): 50 | logits[:, self.config.pad_token_id] = float("-inf") 51 | if cur_len == max_length - 1 and self.config.eos_token_id is not None: 52 | self._force_token_ids_generation(logits, self.config.eos_token_id) 53 | return logits 54 | -------------------------------------------------------------------------------- /transformers/tokenization_longformer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import logging 17 | 18 | from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast 19 | 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | # vocab and merges same as roberta 25 | vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json" 26 | merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt" 27 | _all_longformer_models = [ 28 | "allenai/longformer-base-4096", 29 | "allenai/longformer-large-4096", 30 | "allenai/longformer-large-4096-finetuned-triviaqa", 31 | "allenai/longformer-base-4096-extra.pos.embd.only", 32 | "allenai/longformer-large-4096-extra.pos.embd.only", 33 | ] 34 | 35 | 36 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 37 | "allenai/longformer-base-4096": 4096, 38 | "allenai/longformer-large-4096": 4096, 39 | "allenai/longformer-large-4096-finetuned-triviaqa": 4096, 40 | "allenai/longformer-base-4096-extra.pos.embd.only": 4096, 41 | "allenai/longformer-large-4096-extra.pos.embd.only": 4096, 42 | } 43 | 44 | 45 | class LongformerTokenizer(RobertaTokenizer): 46 | # merges and vocab same as Roberta 47 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 48 | pretrained_vocab_files_map = { 49 | "vocab_file": {m: vocab_url for m in _all_longformer_models}, 50 | "merges_file": {m: merges_url for m in _all_longformer_models}, 51 | } 52 | 53 | 54 | class LongformerTokenizerFast(RobertaTokenizerFast): 55 | # merges and vocab same as Roberta 56 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 57 | pretrained_vocab_files_map = { 58 | "vocab_file": {m: vocab_url for m in _all_longformer_models}, 59 | "merges_file": {m: merges_url for m in _all_longformer_models}, 60 | } 61 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | # model & dataset 2 | *.bin 3 | train.txt 4 | dcn_models/ 5 | runs/ 6 | 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | pip-wheel-metadata/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | -------------------------------------------------------------------------------- /transformers/training_args_tf.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from dataclasses import dataclass, field 3 | from typing import Tuple 4 | 5 | from .file_utils import cached_property, is_tf_available, tf_required 6 | from .training_args import TrainingArguments 7 | 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | if is_tf_available(): 12 | import tensorflow as tf 13 | 14 | 15 | @dataclass 16 | class TFTrainingArguments(TrainingArguments): 17 | tpu_name: str = field( 18 | default=None, metadata={"help": "Name of TPU"}, 19 | ) 20 | eval_steps: int = field(default=1000, metadata={"help": "Run an evaluation every X steps."}) 21 | debug: bool = field( 22 | default=False, metadata={"help": "Activate the trace to record computation graphs and profiling information"} 23 | ) 24 | 25 | @cached_property 26 | @tf_required 27 | def _setup_strategy(self) -> Tuple["tf.distribute.Strategy", int]: 28 | logger.info("Tensorflow: setting up strategy") 29 | gpus = tf.config.list_physical_devices("GPU") 30 | 31 | if self.no_cuda: 32 | strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0") 33 | else: 34 | try: 35 | if self.tpu_name: 36 | tpu = tf.distribute.cluster_resolver.TPUClusterResolver(self.tpu_name) 37 | else: 38 | tpu = tf.distribute.cluster_resolver.TPUClusterResolver() 39 | except ValueError: 40 | tpu = None 41 | 42 | if tpu: 43 | tf.config.experimental_connect_to_cluster(tpu) 44 | tf.tpu.experimental.initialize_tpu_system(tpu) 45 | 46 | strategy = tf.distribute.experimental.TPUStrategy(tpu) 47 | elif len(gpus) == 0: 48 | strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0") 49 | elif len(gpus) == 1: 50 | strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0") 51 | elif len(gpus) > 1: 52 | # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0` 53 | strategy = tf.distribute.MirroredStrategy() 54 | else: 55 | raise ValueError("Cannot find the proper strategy please check your environment properties.") 56 | 57 | return strategy 58 | 59 | @property 60 | @tf_required 61 | def strategy(self) -> "tf.distribute.Strategy": 62 | return self._setup_strategy 63 | 64 | @property 65 | @tf_required 66 | def n_gpu(self) -> int: 67 | return self._setup_strategy.num_replicas_in_sync 68 | -------------------------------------------------------------------------------- /transformers/tokenization_mobilebert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Tokenization classes for MobileBERT.""" 15 | 16 | 17 | import logging 18 | 19 | from .tokenization_bert import BertTokenizer, BertTokenizerFast 20 | 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 25 | 26 | PRETRAINED_VOCAB_FILES_MAP = { 27 | "vocab_file": { 28 | "mobilebert-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/google/mobilebert-uncased/vocab.txt" 29 | } 30 | } 31 | 32 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} 33 | 34 | 35 | PRETRAINED_INIT_CONFIGURATION = {} 36 | 37 | 38 | class MobileBertTokenizer(BertTokenizer): 39 | r""" 40 | Constructs a MobileBertTokenizer. 41 | 42 | :class:`~transformers.MobileBertTokenizer is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 43 | tokenization: punctuation splitting + wordpiece. 44 | 45 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 46 | parameters. 47 | """ 48 | 49 | vocab_files_names = VOCAB_FILES_NAMES 50 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 51 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 52 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 53 | 54 | 55 | class MobileBertTokenizerFast(BertTokenizerFast): 56 | r""" 57 | Constructs a "Fast" MobileBertTokenizer (backed by HuggingFace's `tokenizers` library). 58 | 59 | :class:`~transformers.MobileBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end 60 | tokenization: punctuation splitting + wordpiece. 61 | 62 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning 63 | parameters. 64 | """ 65 | 66 | vocab_files_names = VOCAB_FILES_NAMES 67 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 68 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 69 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 70 | -------------------------------------------------------------------------------- /transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import CONFIG_NAME, WEIGHTS_NAME, GPT2Config, GPT2Model, load_tf_weights_in_gpt2 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 30 | # Construct model 31 | if gpt2_config_file == "": 32 | config = GPT2Config() 33 | else: 34 | config = GPT2Config.from_json_file(gpt2_config_file) 35 | model = GPT2Model(config) 36 | 37 | # Load weights from numpy 38 | load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) 39 | 40 | # Save pytorch-model 41 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 42 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 43 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 44 | torch.save(model.state_dict(), pytorch_weights_dump_path) 45 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 46 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 47 | f.write(config.to_json_string()) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | # Required parameters 53 | parser.add_argument( 54 | "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 55 | ) 56 | parser.add_argument( 57 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 58 | ) 59 | parser.add_argument( 60 | "--gpt2_config_file", 61 | default="", 62 | type=str, 63 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n" 64 | "This specifies the model architecture.", 65 | ) 66 | args = parser.parse_args() 67 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path) 68 | -------------------------------------------------------------------------------- /transformers/tokenization_retribert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for RetriBERT.""" 16 | 17 | 18 | import logging 19 | 20 | from .tokenization_bert import BertTokenizer, BertTokenizerFast 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 26 | 27 | PRETRAINED_VOCAB_FILES_MAP = { 28 | "vocab_file": { 29 | "yjernite/retribert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 30 | } 31 | } 32 | 33 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 34 | "yjernite/retribert-base-uncased": 512, 35 | } 36 | 37 | 38 | PRETRAINED_INIT_CONFIGURATION = { 39 | "yjernite/retribert-base-uncased": {"do_lower_case": True}, 40 | } 41 | 42 | 43 | class RetriBertTokenizer(BertTokenizer): 44 | r""" 45 | Constructs a retribert. 46 | 47 | :class:`~transformers.retribert is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 48 | tokenization: punctuation splitting + wordpiece. 49 | 50 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 51 | parameters. 52 | """ 53 | 54 | vocab_files_names = VOCAB_FILES_NAMES 55 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 56 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 57 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 58 | model_input_names = ["attention_mask"] 59 | 60 | 61 | class RetriBertTokenizerFast(BertTokenizerFast): 62 | r""" 63 | Constructs a "Fast" RetriBertTokenizerFast (backed by HuggingFace's `tokenizers` library). 64 | 65 | :class:`~transformers.RetriBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end 66 | tokenization: punctuation splitting + wordpiece. 67 | 68 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning 69 | parameters. 70 | """ 71 | 72 | vocab_files_names = VOCAB_FILES_NAMES 73 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 74 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 75 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 76 | model_input_names = ["attention_mask"] 77 | -------------------------------------------------------------------------------- /transformers/convert_openai_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import CONFIG_NAME, WEIGHTS_NAME, OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 30 | # Construct model 31 | if openai_config_file == "": 32 | config = OpenAIGPTConfig() 33 | else: 34 | config = OpenAIGPTConfig.from_json_file(openai_config_file) 35 | model = OpenAIGPTModel(config) 36 | 37 | # Load weights from numpy 38 | load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path) 39 | 40 | # Save pytorch-model 41 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 42 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 43 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 44 | torch.save(model.state_dict(), pytorch_weights_dump_path) 45 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 46 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 47 | f.write(config.to_json_string()) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | # Required parameters 53 | parser.add_argument( 54 | "--openai_checkpoint_folder_path", 55 | default=None, 56 | type=str, 57 | required=True, 58 | help="Path to the TensorFlow checkpoint path.", 59 | ) 60 | parser.add_argument( 61 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 62 | ) 63 | parser.add_argument( 64 | "--openai_config_file", 65 | default="", 66 | type=str, 67 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n" 68 | "This specifies the model architecture.", 69 | ) 70 | args = parser.parse_args() 71 | convert_openai_checkpoint_to_pytorch( 72 | args.openai_checkpoint_folder_path, args.openai_config_file, args.pytorch_dump_folder_path 73 | ) 74 | -------------------------------------------------------------------------------- /transformers/benchmark/benchmark_args.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | import logging 18 | from dataclasses import dataclass, field 19 | from typing import Tuple 20 | 21 | from ..file_utils import cached_property, is_torch_available, is_torch_tpu_available, torch_required 22 | from .benchmark_args_utils import BenchmarkArguments 23 | 24 | 25 | if is_torch_available(): 26 | import torch 27 | 28 | if is_torch_tpu_available(): 29 | import torch_xla.core.xla_model as xm 30 | 31 | 32 | logger = logging.getLogger(__name__) 33 | 34 | 35 | @dataclass 36 | class PyTorchBenchmarkArguments(BenchmarkArguments): 37 | torchscript: bool = field(default=False, metadata={"help": "Trace the models using torchscript"}) 38 | torch_xla_tpu_print_metrics: bool = field(default=False, metadata={"help": "Print Xla/PyTorch tpu metrics"}) 39 | fp16_opt_level: str = field( 40 | default="O1", 41 | metadata={ 42 | "help": ( 43 | "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 44 | "See details at https://nvidia.github.io/apex/amp.html" 45 | ) 46 | }, 47 | ) 48 | 49 | @cached_property 50 | @torch_required 51 | def _setup_devices(self) -> Tuple["torch.device", int]: 52 | logger.info("PyTorch: setting up devices") 53 | if self.no_cuda: 54 | device = torch.device("cpu") 55 | n_gpu = 0 56 | elif is_torch_tpu_available(): 57 | device = xm.xla_device() 58 | n_gpu = 0 59 | else: 60 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 61 | n_gpu = torch.cuda.device_count() 62 | return device, n_gpu 63 | 64 | @property 65 | def is_tpu(self): 66 | return is_torch_tpu_available() and not self.no_tpu 67 | 68 | @property 69 | @torch_required 70 | def device_idx(self) -> int: 71 | # TODO(PVP): currently only single GPU is supported 72 | return torch.cuda.current_device() 73 | 74 | @property 75 | @torch_required 76 | def device(self) -> "torch.device": 77 | return self._setup_devices[0] 78 | 79 | @property 80 | @torch_required 81 | def n_gpu(self): 82 | return self._setup_devices[1] 83 | 84 | @property 85 | def is_gpu(self): 86 | return self.n_gpu > 0 87 | -------------------------------------------------------------------------------- /transformers/convert_electra_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
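# Rough sketch of the PyTorch benchmark arguments above (model names, batch
# sizes and sequence lengths are illustrative; those fields are assumed to come
# from the shared BenchmarkArguments base class in benchmark_args_utils.py, as
# in upstream transformers 3.0.0).
from transformers.benchmark.benchmark_args import PyTorchBenchmarkArguments

bench_args = PyTorchBenchmarkArguments(
    models=["bert-base-uncased"], batch_sizes=[1], sequence_lengths=[128]
)
print(bench_args.device, bench_args.n_gpu, bench_args.is_tpu)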
15 | """Convert ELECTRA checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import ElectraConfig, ElectraForMaskedLM, ElectraForPreTraining, load_tf_weights_in_electra 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator): 30 | # Initialise PyTorch model 31 | config = ElectraConfig.from_json_file(config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | 34 | if discriminator_or_generator == "discriminator": 35 | model = ElectraForPreTraining(config) 36 | elif discriminator_or_generator == "generator": 37 | model = ElectraForMaskedLM(config) 38 | else: 39 | raise ValueError("The discriminator_or_generator argument should be either 'discriminator' or 'generator'") 40 | 41 | # Load weights from tf checkpoint 42 | load_tf_weights_in_electra( 43 | model, config, tf_checkpoint_path, discriminator_or_generator=discriminator_or_generator 44 | ) 45 | 46 | # Save pytorch-model 47 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 48 | torch.save(model.state_dict(), pytorch_dump_path) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | # Required parameters 54 | parser.add_argument( 55 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 56 | ) 57 | parser.add_argument( 58 | "--config_file", 59 | default=None, 60 | type=str, 61 | required=True, 62 | help="The config json file corresponding to the pre-trained model. \n" 63 | "This specifies the model architecture.", 64 | ) 65 | parser.add_argument( 66 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 67 | ) 68 | parser.add_argument( 69 | "--discriminator_or_generator", 70 | default=None, 71 | type=str, 72 | required=True, 73 | help="Whether to export the generator or the discriminator. Should be a string, either 'discriminator' or " 74 | "'generator'.", 75 | ) 76 | args = parser.parse_args() 77 | convert_tf_checkpoint_to_pytorch( 78 | args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.discriminator_or_generator 79 | ) 80 | -------------------------------------------------------------------------------- /transformers/data/datasets/dcn_language_modeling.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pickle 4 | import time 5 | 6 | import torch 7 | from filelock import FileLock 8 | from torch.utils.data.dataset import Dataset 9 | 10 | from ...tokenization_utils import PreTrainedTokenizer 11 | import random 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class PinyinShuffleLineByLineTextDataset(Dataset): 17 | """ 18 | This will be superseded by a framework-agnostic approach 19 | soon. 
20 | """ 21 | def __init__(self, 22 | tokenizer: PreTrainedTokenizer, 23 | file_path: str, 24 | block_size: int, 25 | shuffle=True): 26 | shuffle = False 27 | print(file_path, os.path.isfile(file_path)) 28 | assert os.path.isfile(file_path) 29 | # Here, we do not cache the features, operating under the assumption 30 | # that we will soon use fast multithreaded tokenizers from the 31 | # `tokenizers` repo everywhere =) 32 | logger.info("Creating features from dataset file at %s", file_path) 33 | 34 | self.count = 0 35 | self.file_path = file_path 36 | self.block_size = block_size 37 | self.tokenizer = tokenizer 38 | with open(file_path, encoding="utf-8") as f: 39 | for line in f: 40 | line = line.strip() 41 | if len(line.split('\t')) >= 4: 42 | self.count += 1 43 | self.input_file = open(file_path, encoding="utf-8") 44 | self.lines = self.input_file.readlines() 45 | if shuffle: 46 | random.shuffle(self.lines) 47 | 48 | def __len__(self): 49 | return self.count 50 | 51 | def __getitem__(self, i) -> torch.Tensor: 52 | block_size = self.block_size 53 | line = self.lines[i] 54 | line = line.strip() 55 | 56 | line_input, line_label, line_mask, line_pos = line.split('\t')[:4] 57 | line_input_items = line_input.split() 58 | line_label_items = line_label.split() 59 | line_mask_items = line_mask.split() 60 | line_pos_items = line_pos.split() 61 | assert len(line_input_items) == len(line_label_items) == len( 62 | line_mask_items) == len(line_pos_items) 63 | input_ids = self.tokenizer.convert_tokens_to_ids( 64 | ["[CLS]"] + line_input_items[:block_size - 2] + ["[SEP]"]) 65 | label_ids = self.tokenizer.convert_tokens_to_ids( 66 | ["[CLS]"] + line_label_items[:block_size - 2] + ["[SEP]"]) 67 | 68 | mask_ids = [0] + [ 69 | int(item) for item in line_mask_items[:block_size - 2] 70 | ] + [0] 71 | pos_ids = [0 72 | ] + [int(item) 73 | for item in line_pos_items[:block_size - 2]] + [0] 74 | assert len(input_ids) == len(label_ids) == len(mask_ids) == len( 75 | pos_ids) 76 | return torch.tensor(input_ids, dtype=torch.long), torch.tensor( 77 | label_ids, dtype=torch.long), torch.tensor( 78 | mask_ids, dtype=torch.long), torch.tensor(pos_ids, 79 | dtype=torch.long) 80 | 81 | -------------------------------------------------------------------------------- /transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import json 20 | import logging 21 | 22 | import numpy 23 | import torch 24 | 25 | from transformers import CONFIG_NAME, WEIGHTS_NAME 26 | from transformers.tokenization_xlm import VOCAB_FILES_NAMES 27 | 28 | 29 | logging.basicConfig(level=logging.INFO) 30 | 31 | 32 | def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path): 33 | # Load checkpoint 34 | chkpt = torch.load(xlm_checkpoint_path, map_location="cpu") 35 | 36 | state_dict = chkpt["model"] 37 | 38 | # We have the base model one level deeper than the original XLM repository 39 | two_levels_state_dict = {} 40 | for k, v in state_dict.items(): 41 | if "pred_layer" in k: 42 | two_levels_state_dict[k] = v 43 | else: 44 | two_levels_state_dict["transformer." + k] = v 45 | 46 | config = chkpt["params"] 47 | config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))) 48 | 49 | vocab = chkpt["dico_word2id"] 50 | vocab = dict((s + "" if s.find("@@") == -1 and i > 13 else s.replace("@@", ""), i) for s, i in vocab.items()) 51 | 52 | # Save pytorch-model 53 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 54 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 55 | pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["vocab_file"] 56 | 57 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 58 | torch.save(two_levels_state_dict, pytorch_weights_dump_path) 59 | 60 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 61 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 62 | f.write(json.dumps(config, indent=2) + "\n") 63 | 64 | print("Save vocab file to {}".format(pytorch_config_dump_path)) 65 | with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f: 66 | f.write(json.dumps(vocab, indent=2) + "\n") 67 | 68 | 69 | if __name__ == "__main__": 70 | parser = argparse.ArgumentParser() 71 | # Required parameters 72 | parser.add_argument( 73 | "--xlm_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." 74 | ) 75 | parser.add_argument( 76 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 77 | ) 78 | args = parser.parse_args() 79 | convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path) 80 | -------------------------------------------------------------------------------- /transformers/data/processors/xnli.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ XNLI utils (dataset loading and evaluation) """ 17 | 18 | 19 | import logging 20 | import os 21 | 22 | from .utils import DataProcessor, InputExample 23 | 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | class XnliProcessor(DataProcessor): 29 | """Processor for the XNLI dataset. 30 | Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207""" 31 | 32 | def __init__(self, language, train_language=None): 33 | self.language = language 34 | self.train_language = train_language 35 | 36 | def get_train_examples(self, data_dir): 37 | """See base class.""" 38 | lg = self.language if self.train_language is None else self.train_language 39 | lines = self._read_tsv(os.path.join(data_dir, "XNLI-MT-1.0/multinli/multinli.train.{}.tsv".format(lg))) 40 | examples = [] 41 | for (i, line) in enumerate(lines): 42 | if i == 0: 43 | continue 44 | guid = "%s-%s" % ("train", i) 45 | text_a = line[0] 46 | text_b = line[1] 47 | label = "contradiction" if line[2] == "contradictory" else line[2] 48 | assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str) 49 | examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 50 | return examples 51 | 52 | def get_test_examples(self, data_dir): 53 | """See base class.""" 54 | lines = self._read_tsv(os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv")) 55 | examples = [] 56 | for (i, line) in enumerate(lines): 57 | if i == 0: 58 | continue 59 | language = line[0] 60 | if language != self.language: 61 | continue 62 | guid = "%s-%s" % ("test", i) 63 | text_a = line[6] 64 | text_b = line[7] 65 | label = line[1] 66 | assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str) 67 | examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 68 | return examples 69 | 70 | def get_labels(self): 71 | """See base class.""" 72 | return ["contradiction", "entailment", "neutral"] 73 | 74 | 75 | xnli_processors = { 76 | "xnli": XnliProcessor, 77 | } 78 | 79 | xnli_output_modes = { 80 | "xnli": "classification", 81 | } 82 | 83 | xnli_tasks_num_labels = { 84 | "xnli": 3, 85 | } 86 | -------------------------------------------------------------------------------- /transformers/data/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | try: 18 | from scipy.stats import pearsonr, spearmanr 19 | from sklearn.metrics import matthews_corrcoef, f1_score 20 | 21 | _has_sklearn = True 22 | except (AttributeError, ImportError): 23 | _has_sklearn = False 24 | 25 | 26 | def is_sklearn_available(): 27 | return _has_sklearn 28 | 29 | 30 | if _has_sklearn: 31 | 32 | def simple_accuracy(preds, labels): 33 | return (preds == labels).mean() 34 | 35 | def acc_and_f1(preds, labels): 36 | acc = simple_accuracy(preds, labels) 37 | f1 = f1_score(y_true=labels, y_pred=preds) 38 | return { 39 | "acc": acc, 40 | "f1": f1, 41 | "acc_and_f1": (acc + f1) / 2, 42 | } 43 | 44 | def pearson_and_spearman(preds, labels): 45 | pearson_corr = pearsonr(preds, labels)[0] 46 | spearman_corr = spearmanr(preds, labels)[0] 47 | return { 48 | "pearson": pearson_corr, 49 | "spearmanr": spearman_corr, 50 | "corr": (pearson_corr + spearman_corr) / 2, 51 | } 52 | 53 | def glue_compute_metrics(task_name, preds, labels): 54 | assert len(preds) == len(labels) 55 | if task_name == "cola": 56 | return {"mcc": matthews_corrcoef(labels, preds)} 57 | elif task_name == "sst-2": 58 | return {"acc": simple_accuracy(preds, labels)} 59 | elif task_name == "mrpc": 60 | return acc_and_f1(preds, labels) 61 | elif task_name == "sts-b": 62 | return pearson_and_spearman(preds, labels) 63 | elif task_name == "qqp": 64 | return acc_and_f1(preds, labels) 65 | elif task_name == "mnli": 66 | return {"mnli/acc": simple_accuracy(preds, labels)} 67 | elif task_name == "mnli-mm": 68 | return {"mnli-mm/acc": simple_accuracy(preds, labels)} 69 | elif task_name == "qnli": 70 | return {"acc": simple_accuracy(preds, labels)} 71 | elif task_name == "rte": 72 | return {"acc": simple_accuracy(preds, labels)} 73 | elif task_name == "wnli": 74 | return {"acc": simple_accuracy(preds, labels)} 75 | elif task_name == "hans": 76 | return {"acc": simple_accuracy(preds, labels)} 77 | else: 78 | raise KeyError(task_name) 79 | 80 | def xnli_compute_metrics(task_name, preds, labels): 81 | assert len(preds) == len(labels) 82 | if task_name == "xnli": 83 | return {"acc": simple_accuracy(preds, labels)} 84 | else: 85 | raise KeyError(task_name) 86 | -------------------------------------------------------------------------------- /transformers/convert_longformer_original_pytorch_lightning_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
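# Quick sketch of the GLUE metric helpers above (they are only defined when
# scipy and scikit-learn are importable, hence the availability check; the
# arrays are toy predictions).
import numpy as np
from transformers.data import metrics

if metrics.is_sklearn_available():
    preds = np.array([1, 0, 1, 1])
    labels = np.array([1, 0, 0, 1])
    print(metrics.glue_compute_metrics("mrpc", preds, labels))  # acc, f1 and their mean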
15 | """Convert RoBERTa checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import pytorch_lightning as pl 21 | import torch 22 | 23 | from transformers.modeling_longformer import LongformerForQuestionAnswering, LongformerModel 24 | 25 | 26 | class LightningModel(pl.LightningModule): 27 | def __init__(self, model): 28 | super().__init__() 29 | self.model = model 30 | self.num_labels = 2 31 | self.qa_outputs = torch.nn.Linear(self.model.config.hidden_size, self.num_labels) 32 | 33 | # implement only because lighning requires to do so 34 | def forward(self): 35 | pass 36 | 37 | 38 | def convert_longformer_qa_checkpoint_to_pytorch( 39 | longformer_model: str, longformer_question_answering_ckpt_path: str, pytorch_dump_folder_path: str 40 | ): 41 | 42 | # load longformer model from model identifier 43 | longformer = LongformerModel.from_pretrained(longformer_model) 44 | lightning_model = LightningModel(longformer) 45 | 46 | ckpt = torch.load(longformer_question_answering_ckpt_path, map_location=torch.device("cpu")) 47 | lightning_model.load_state_dict(ckpt["state_dict"]) 48 | 49 | # init longformer question answering model 50 | longformer_for_qa = LongformerForQuestionAnswering.from_pretrained(longformer_model) 51 | 52 | # transfer weights 53 | longformer_for_qa.longformer.load_state_dict(lightning_model.model.state_dict()) 54 | longformer_for_qa.qa_outputs.load_state_dict(lightning_model.qa_outputs.state_dict()) 55 | longformer_for_qa.eval() 56 | 57 | # save model 58 | longformer_for_qa.save_pretrained(pytorch_dump_folder_path) 59 | 60 | print("Conversion succesful. Model saved under {}".format(pytorch_dump_folder_path)) 61 | 62 | 63 | if __name__ == "__main__": 64 | parser = argparse.ArgumentParser() 65 | # Required parameters 66 | parser.add_argument( 67 | "--longformer_model", 68 | default=None, 69 | type=str, 70 | required=True, 71 | help="model identifier of longformer. Should be either `longformer-base-4096` or `longformer-large-4096`.", 72 | ) 73 | parser.add_argument( 74 | "--longformer_question_answering_ckpt_path", 75 | default=None, 76 | type=str, 77 | required=True, 78 | help="Path the official PyTorch Lighning Checkpoint.", 79 | ) 80 | parser.add_argument( 81 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 82 | ) 83 | args = parser.parse_args() 84 | convert_longformer_qa_checkpoint_to_pytorch( 85 | args.longformer_model, args.longformer_question_answering_ckpt_path, args.pytorch_dump_folder_path 86 | ) 87 | -------------------------------------------------------------------------------- /transformers/configuration_longformer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """ Longformer configuration """ 16 | 17 | import logging 18 | from typing import List, Union 19 | 20 | from .configuration_roberta import RobertaConfig 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = { 26 | "allenai/longformer-base-4096": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-base-4096/config.json", 27 | "allenai/longformer-large-4096": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096/config.json", 28 | "allenai/longformer-large-4096-finetuned-triviaqa": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096-finetuned-triviaqa/config.json", 29 | "allenai/longformer-base-4096-extra.pos.embd.only": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-base-4096-extra.pos.embd.only/config.json", 30 | "allenai/longformer-large-4096-extra.pos.embd.only": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096-extra.pos.embd.only/config.json", 31 | } 32 | 33 | 34 | class LongformerConfig(RobertaConfig): 35 | r""" 36 | This is the configuration class to store the configuration of a :class:`~transformers.LongformerModel`. 37 | It is used to instantiate an Longformer model according to the specified arguments, defining the model 38 | architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of 39 | the RoBERTa `roberta-base `__ architecture with a sequence length 4,096. 40 | 41 | The :class:`~transformers.LongformerConfig` class directly inherits :class:`~transformers.RobertaConfig`. 42 | It reuses the same defaults. Please check the parent class for more information. 43 | 44 | Args: 45 | attention_window (:obj:`int` or :obj:`List[int]`, optional, defaults to 512): 46 | Size of an attention window around each token. If :obj:`int`, use the same size for all layers. 47 | To specify a different window size for each layer, use a :obj:`List[int]` where 48 | ``len(attention_window) == num_hidden_layers``. 49 | 50 | Example:: 51 | 52 | >>> from transformers import LongformerConfig, LongformerModel 53 | 54 | >>> # Initializing a Longformer configuration 55 | >>> configuration = LongformerConfig() 56 | 57 | >>> # Initializing a model from the configuration 58 | >>> model = LongformerModel(configuration) 59 | 60 | >>> # Accessing the model configuration 61 | >>> configuration = model.config 62 | """ 63 | model_type = "longformer" 64 | 65 | def __init__(self, attention_window: Union[List[int], int] = 512, sep_token_id: int = 2, **kwargs): 66 | super().__init__(**kwargs) 67 | self.attention_window = attention_window 68 | self.sep_token_id = sep_token_id 69 | -------------------------------------------------------------------------------- /transformers/configuration_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ RoBERTa configuration """ 17 | 18 | import logging 19 | 20 | from .configuration_bert import BertConfig 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 25 | "roberta-base": 26 | "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", 27 | "roberta-large": 28 | "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", 29 | "roberta-large-mnli": 30 | "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", 31 | "distilroberta-base": 32 | "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json", 33 | "roberta-base-openai-detector": 34 | "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-config.json", 35 | "roberta-large-openai-detector": 36 | "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-config.json", 37 | } 38 | 39 | 40 | class RobertaConfig(BertConfig): 41 | r""" 42 | This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel`. 43 | It is used to instantiate an RoBERTa model according to the specified arguments, defining the model 44 | architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of 45 | the BERT `bert-base-uncased `__ architecture. 46 | 47 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used 48 | to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` 49 | for more information. 50 | 51 | The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`. 52 | It reuses the same defaults. Please check the parent class for more information. 53 | 54 | Example:: 55 | 56 | >>> from transformers import RobertaConfig, RobertaModel 57 | 58 | >>> # Initializing a RoBERTa configuration 59 | >>> configuration = RobertaConfig() 60 | 61 | >>> # Initializing a model from the configuration 62 | >>> model = RobertaModel(configuration) 63 | 64 | >>> # Accessing the model configuration 65 | >>> configuration = model.config 66 | """ 67 | model_type = "roberta" 68 | n_autoregressive = 3 69 | is_resnet = True 70 | 71 | def __init__(self, 72 | pad_token_id=1, 73 | bos_token_id=0, 74 | eos_token_id=2, 75 | **kwargs): 76 | """Constructs RobertaConfig. 
77 | """ 78 | super().__init__(pad_token_id=pad_token_id, 79 | bos_token_id=bos_token_id, 80 | eos_token_id=eos_token_id, 81 | **kwargs) 82 | -------------------------------------------------------------------------------- /vocab/pinyin_vocab.txt: -------------------------------------------------------------------------------- 1 | [OTHER] 2 | a 3 | ai 4 | an 5 | ang 6 | ao 7 | ba 8 | bai 9 | ban 10 | bang 11 | bao 12 | bei 13 | ben 14 | beng 15 | bi 16 | bian 17 | biao 18 | bie 19 | bin 20 | bing 21 | bo 22 | bu 23 | ca 24 | cai 25 | can 26 | cang 27 | cao 28 | ce 29 | cen 30 | ceng 31 | cha 32 | chai 33 | chan 34 | chang 35 | chao 36 | che 37 | chen 38 | cheng 39 | chi 40 | chong 41 | chou 42 | chu 43 | chuai 44 | chuan 45 | chuang 46 | chui 47 | chun 48 | chuo 49 | ci 50 | cong 51 | cou 52 | cu 53 | cuan 54 | cui 55 | cun 56 | cuo 57 | da 58 | dai 59 | dan 60 | dang 61 | dao 62 | de 63 | deng 64 | di 65 | dian 66 | diao 67 | die 68 | ding 69 | diu 70 | dong 71 | dou 72 | du 73 | duan 74 | dui 75 | dun 76 | duo 77 | e 78 | ei 79 | en 80 | er 81 | fa 82 | fan 83 | fang 84 | fei 85 | fen 86 | feng 87 | fo 88 | fou 89 | fu 90 | ga 91 | gai 92 | gan 93 | gang 94 | gao 95 | ge 96 | gei 97 | gen 98 | geng 99 | gong 100 | gou 101 | gu 102 | gua 103 | guai 104 | guan 105 | guang 106 | gui 107 | gun 108 | guo 109 | ha 110 | hai 111 | han 112 | hang 113 | hao 114 | he 115 | hei 116 | hen 117 | heng 118 | hong 119 | hou 120 | hu 121 | hua 122 | huai 123 | huan 124 | huang 125 | hui 126 | hun 127 | huo 128 | ji 129 | jia 130 | jian 131 | jiang 132 | jiao 133 | jie 134 | jin 135 | jing 136 | jiong 137 | jiu 138 | ju 139 | juan 140 | jue 141 | jun 142 | ka 143 | kai 144 | kan 145 | kang 146 | kao 147 | ke 148 | ken 149 | keng 150 | kong 151 | kou 152 | ku 153 | kua 154 | kuai 155 | kuan 156 | kuang 157 | kui 158 | kun 159 | kuo 160 | la 161 | lai 162 | lan 163 | lang 164 | lao 165 | le 166 | lei 167 | leng 168 | li 169 | lia 170 | lian 171 | liang 172 | liao 173 | lie 174 | lin 175 | ling 176 | liu 177 | long 178 | lou 179 | lu 180 | luan 181 | lun 182 | luo 183 | lv 184 | lve 185 | ma 186 | mai 187 | man 188 | mang 189 | mao 190 | me 191 | mei 192 | men 193 | meng 194 | mi 195 | mian 196 | miao 197 | mie 198 | min 199 | ming 200 | miu 201 | mo 202 | mou 203 | mu 204 | n 205 | na 206 | nai 207 | nan 208 | nang 209 | nao 210 | ne 211 | nei 212 | nen 213 | neng 214 | ni 215 | nian 216 | niang 217 | niao 218 | nie 219 | nin 220 | ning 221 | niu 222 | nong 223 | nou 224 | nu 225 | nuan 226 | nuo 227 | nv 228 | nve 229 | o 230 | ou 231 | pa 232 | pai 233 | pan 234 | pang 235 | pao 236 | pei 237 | pen 238 | peng 239 | pi 240 | pian 241 | piao 242 | pie 243 | pin 244 | ping 245 | po 246 | pou 247 | pu 248 | qi 249 | qia 250 | qian 251 | qiang 252 | qiao 253 | qie 254 | qin 255 | qing 256 | qiong 257 | qiu 258 | qu 259 | quan 260 | que 261 | qun 262 | ran 263 | rang 264 | rao 265 | re 266 | ren 267 | reng 268 | ri 269 | rong 270 | rou 271 | ru 272 | ruan 273 | rui 274 | run 275 | ruo 276 | sa 277 | sai 278 | san 279 | sang 280 | sao 281 | se 282 | sen 283 | seng 284 | sha 285 | shai 286 | shan 287 | shang 288 | shao 289 | she 290 | shei 291 | shen 292 | sheng 293 | shi 294 | shou 295 | shu 296 | shua 297 | shuai 298 | shuan 299 | shuang 300 | shui 301 | shun 302 | shuo 303 | si 304 | song 305 | sou 306 | su 307 | suan 308 | sui 309 | sun 310 | suo 311 | ta 312 | tai 313 | tan 314 | tang 315 | tao 316 | te 317 | teng 318 | ti 319 | tian 320 | tiao 321 | tie 322 | ting 323 | tong 324 | tou 325 | tu 326 | tuan 
327 | tui 328 | tun 329 | tuo 330 | wa 331 | wai 332 | wan 333 | wang 334 | wei 335 | wen 336 | weng 337 | wo 338 | wu 339 | xi 340 | xia 341 | xian 342 | xiang 343 | xiao 344 | xie 345 | xin 346 | xing 347 | xiong 348 | xiu 349 | xu 350 | xuan 351 | xue 352 | xun 353 | ya 354 | yan 355 | yang 356 | yao 357 | ye 358 | yi 359 | yin 360 | ying 361 | yo 362 | yong 363 | you 364 | yu 365 | yuan 366 | yue 367 | yun 368 | za 369 | zai 370 | zan 371 | zang 372 | zao 373 | ze 374 | zei 375 | zen 376 | zeng 377 | zha 378 | zhai 379 | zhan 380 | zhang 381 | zhao 382 | zhe 383 | zhen 384 | zheng 385 | zhi 386 | zhong 387 | zhou 388 | zhu 389 | zhua 390 | zhuai 391 | zhuan 392 | zhuang 393 | zhui 394 | zhun 395 | zhuo 396 | zi 397 | zong 398 | zou 399 | zu 400 | zuan 401 | zui 402 | zun 403 | zuo 404 | den 405 | -------------------------------------------------------------------------------- /transformers/commands/run.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from argparse import ArgumentParser 3 | 4 | from transformers.commands import BaseTransformersCLICommand 5 | from transformers.pipelines import SUPPORTED_TASKS, Pipeline, PipelineDataFormat, pipeline 6 | 7 | 8 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 9 | 10 | 11 | def try_infer_format_from_ext(path: str): 12 | if not path: 13 | return "pipe" 14 | 15 | for ext in PipelineDataFormat.SUPPORTED_FORMATS: 16 | if path.endswith(ext): 17 | return ext 18 | 19 | raise Exception( 20 | "Unable to determine file format from file extension {}. " 21 | "Please provide the format through --format {}".format(path, PipelineDataFormat.SUPPORTED_FORMATS) 22 | ) 23 | 24 | 25 | def run_command_factory(args): 26 | nlp = pipeline( 27 | task=args.task, 28 | model=args.model if args.model else None, 29 | config=args.config, 30 | tokenizer=args.tokenizer, 31 | device=args.device, 32 | ) 33 | format = try_infer_format_from_ext(args.input) if args.format == "infer" else args.format 34 | reader = PipelineDataFormat.from_str( 35 | format=format, 36 | output_path=args.output, 37 | input_path=args.input, 38 | column=args.column if args.column else nlp.default_input_names, 39 | overwrite=args.overwrite, 40 | ) 41 | return RunCommand(nlp, reader) 42 | 43 | 44 | class RunCommand(BaseTransformersCLICommand): 45 | def __init__(self, nlp: Pipeline, reader: PipelineDataFormat): 46 | self._nlp = nlp 47 | self._reader = reader 48 | 49 | @staticmethod 50 | def register_subcommand(parser: ArgumentParser): 51 | run_parser = parser.add_parser("run", help="Run a pipeline through the CLI") 52 | run_parser.add_argument("--task", choices=SUPPORTED_TASKS.keys(), help="Task to run") 53 | run_parser.add_argument("--input", type=str, help="Path to the file to use for inference") 54 | run_parser.add_argument("--output", type=str, help="Path to the file that will be used post to write results.") 55 | run_parser.add_argument("--model", type=str, help="Name or path to the model to instantiate.") 56 | run_parser.add_argument("--config", type=str, help="Name or path to the model's config to instantiate.") 57 | run_parser.add_argument( 58 | "--tokenizer", type=str, help="Name of the tokenizer to use. (default: same as the model name)" 59 | ) 60 | run_parser.add_argument( 61 | "--column", 62 | type=str, 63 | help="Name of the column to use as input. 
(For multi columns input as QA use column1,columns2)", 64 | ) 65 | run_parser.add_argument( 66 | "--format", 67 | type=str, 68 | default="infer", 69 | choices=PipelineDataFormat.SUPPORTED_FORMATS, 70 | help="Input format to read from", 71 | ) 72 | run_parser.add_argument( 73 | "--device", 74 | type=int, 75 | default=-1, 76 | help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)", 77 | ) 78 | run_parser.add_argument("--overwrite", action="store_true", help="Allow overwriting the output file.") 79 | run_parser.set_defaults(func=run_command_factory) 80 | 81 | def run(self): 82 | nlp, outputs = self._nlp, [] 83 | 84 | for entry in self._reader: 85 | output = nlp(**entry) if self._reader.is_multi_columns else nlp(entry) 86 | if isinstance(output, dict): 87 | outputs.append(output) 88 | else: 89 | outputs += output 90 | 91 | # Saving data 92 | if self._nlp.binary_output: 93 | binary_path = self._reader.save_binary(outputs) 94 | logger.warning("Current pipeline requires output to be in binary format, saving at {}".format(binary_path)) 95 | else: 96 | self._reader.save(outputs) 97 | -------------------------------------------------------------------------------- /transformers/tokenization_electra.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
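# Rough sketch of what the `run` command above wires together when used
# programmatically (task and input are illustrative; the default model for the
# task is downloaded on first use, and device=-1 keeps inference on CPU as in
# the command's own default).
from transformers.pipelines import pipeline

nlp = pipeline(task="sentiment-analysis", device=-1)
print(nlp("This fix looks good to me."))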
15 | 16 | from .tokenization_bert import BertTokenizer, BertTokenizerFast 17 | 18 | 19 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 20 | 21 | PRETRAINED_VOCAB_FILES_MAP = { 22 | "vocab_file": { 23 | "google/electra-small-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-generator/vocab.txt", 24 | "google/electra-base-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/vocab.txt", 25 | "google/electra-large-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-generator/vocab.txt", 26 | "google/electra-small-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-discriminator/vocab.txt", 27 | "google/electra-base-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/vocab.txt", 28 | "google/electra-large-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-discriminator/vocab.txt", 29 | } 30 | } 31 | 32 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 33 | "google/electra-small-generator": 512, 34 | "google/electra-base-generator": 512, 35 | "google/electra-large-generator": 512, 36 | "google/electra-small-discriminator": 512, 37 | "google/electra-base-discriminator": 512, 38 | "google/electra-large-discriminator": 512, 39 | } 40 | 41 | 42 | PRETRAINED_INIT_CONFIGURATION = { 43 | "google/electra-small-generator": {"do_lower_case": True}, 44 | "google/electra-base-generator": {"do_lower_case": True}, 45 | "google/electra-large-generator": {"do_lower_case": True}, 46 | "google/electra-small-discriminator": {"do_lower_case": True}, 47 | "google/electra-base-discriminator": {"do_lower_case": True}, 48 | "google/electra-large-discriminator": {"do_lower_case": True}, 49 | } 50 | 51 | 52 | class ElectraTokenizer(BertTokenizer): 53 | r""" 54 | Constructs an Electra tokenizer. 55 | :class:`~transformers.ElectraTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 56 | tokenization: punctuation splitting + wordpiece. 57 | 58 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 59 | parameters. 60 | """ 61 | 62 | vocab_files_names = VOCAB_FILES_NAMES 63 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 64 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 65 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 66 | 67 | 68 | class ElectraTokenizerFast(BertTokenizerFast): 69 | r""" 70 | Constructs a "Fast" Electra Fast tokenizer (backed by HuggingFace's `tokenizers` library). 71 | 72 | :class:`~transformers.ElectraTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end 73 | tokenization: punctuation splitting + wordpiece. 74 | 75 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning 76 | parameters. 77 | """ 78 | vocab_files_names = VOCAB_FILES_NAMES 79 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 80 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 81 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 82 | -------------------------------------------------------------------------------- /transformers/benchmark/benchmark_args_tf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. 
All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import logging 18 | from dataclasses import dataclass, field 19 | from typing import Tuple 20 | 21 | from ..file_utils import cached_property, is_tf_available, tf_required 22 | from .benchmark_args_utils import BenchmarkArguments 23 | 24 | 25 | if is_tf_available(): 26 | import tensorflow as tf 27 | 28 | 29 | logger = logging.getLogger(__name__) 30 | 31 | 32 | @dataclass 33 | class TensorFlowBenchmarkArguments(BenchmarkArguments): 34 | tpu_name: str = field( 35 | default=None, metadata={"help": "Name of TPU"}, 36 | ) 37 | device_idx: int = field( 38 | default=0, metadata={"help": "CPU / GPU device index. Defaults to 0."}, 39 | ) 40 | eager_mode: bool = field(default=False, metadata={"help": "Benchmark models in eager model."}) 41 | use_xla: bool = field( 42 | default=False, 43 | metadata={ 44 | "help": "Benchmark models using XLA JIT compilation. Note that `eager_model` has to be set to `False`." 45 | }, 46 | ) 47 | 48 | @cached_property 49 | @tf_required 50 | def _setup_tpu(self) -> Tuple["tf.distribute.cluster_resolver.TPUClusterResolver"]: 51 | if not self.no_tpu: 52 | try: 53 | if self.tpu_name: 54 | tpu = tf.distribute.cluster_resolver.TPUClusterResolver(self.tpu_name) 55 | else: 56 | tpu = tf.distribute.cluster_resolver.TPUClusterResolver() 57 | except ValueError: 58 | tpu = None 59 | return tpu 60 | 61 | @cached_property 62 | @tf_required 63 | def _setup_strategy(self) -> Tuple["tf.distribute.Strategy", "tf.distribute.cluster_resolver.TPUClusterResolver"]: 64 | if self.is_tpu: 65 | tf.config.experimental_connect_to_cluster(self._setup_tpu) 66 | tf.tpu.experimental.initialize_tpu_system(self._setup_tpu) 67 | 68 | strategy = tf.distribute.experimental.TPUStrategy(self._setup_tpu) 69 | else: 70 | # currently no multi gpu is allowed 71 | if self.is_gpu: 72 | # TODO: Currently only single GPU is supported 73 | tf.config.experimental.set_visible_devices(self.gpu_list[self.device_idx], "GPU") 74 | strategy = tf.distribute.OneDeviceStrategy(device=f"/gpu:{self.device_idx}") 75 | else: 76 | tf.config.experimental.set_visible_devices([], "GPU") # disable GPU 77 | strategy = tf.distribute.OneDeviceStrategy(device=f"/cpu:{self.device_idx}") 78 | 79 | return strategy 80 | 81 | @property 82 | @tf_required 83 | def is_tpu(self) -> bool: 84 | return self._setup_tpu is not None 85 | 86 | @property 87 | @tf_required 88 | def strategy(self) -> "tf.distribute.Strategy": 89 | return self._setup_strategy 90 | 91 | @property 92 | @tf_required 93 | def gpu_list(self): 94 | return tf.config.list_physical_devices("GPU") 95 | 96 | @property 97 | @tf_required 98 | def n_gpu(self) -> int: 99 | if not self.no_cuda: 100 | return len(self.gpu_list) 101 | return 0 102 | 103 | @property 104 | def is_gpu(self) -> bool: 105 | return self.n_gpu > 0 106 | -------------------------------------------------------------------------------- /transformers/tokenization_distilbert.py: 
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for DistilBERT.""" 16 | 17 | 18 | import logging 19 | 20 | from .tokenization_bert import BertTokenizer, BertTokenizerFast 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 26 | 27 | PRETRAINED_VOCAB_FILES_MAP = { 28 | "vocab_file": { 29 | "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 30 | "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 31 | "distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", 32 | "distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", 33 | "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt", 34 | "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", 35 | } 36 | } 37 | 38 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 39 | "distilbert-base-uncased": 512, 40 | "distilbert-base-uncased-distilled-squad": 512, 41 | "distilbert-base-cased": 512, 42 | "distilbert-base-cased-distilled-squad": 512, 43 | "distilbert-base-german-cased": 512, 44 | "distilbert-base-multilingual-cased": 512, 45 | } 46 | 47 | 48 | PRETRAINED_INIT_CONFIGURATION = { 49 | "distilbert-base-uncased": {"do_lower_case": True}, 50 | "distilbert-base-uncased-distilled-squad": {"do_lower_case": True}, 51 | "distilbert-base-cased": {"do_lower_case": False}, 52 | "distilbert-base-cased-distilled-squad": {"do_lower_case": False}, 53 | "distilbert-base-german-cased": {"do_lower_case": False}, 54 | "distilbert-base-multilingual-cased": {"do_lower_case": False}, 55 | } 56 | 57 | 58 | class DistilBertTokenizer(BertTokenizer): 59 | r""" 60 | Constructs a DistilBertTokenizer. 61 | 62 | :class:`~transformers.DistilBertTokenizer is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 63 | tokenization: punctuation splitting + wordpiece. 64 | 65 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 66 | parameters. 67 | """ 68 | 69 | vocab_files_names = VOCAB_FILES_NAMES 70 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 71 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 72 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 73 | model_input_names = ["attention_mask"] 74 | 75 | 76 | class DistilBertTokenizerFast(BertTokenizerFast): 77 | r""" 78 | Constructs a "Fast" DistilBertTokenizer (backed by HuggingFace's `tokenizers` library). 
79 | 80 | :class:`~transformers.DistilBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end 81 | tokenization: punctuation splitting + wordpiece. 82 | 83 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning 84 | parameters. 85 | """ 86 | 87 | vocab_files_names = VOCAB_FILES_NAMES 88 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 89 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 90 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 91 | model_input_names = ["attention_mask"] 92 | -------------------------------------------------------------------------------- /transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | import os 21 | 22 | import torch 23 | 24 | from transformers import ( 25 | CONFIG_NAME, 26 | WEIGHTS_NAME, 27 | XLNetConfig, 28 | XLNetForQuestionAnswering, 29 | XLNetForSequenceClassification, 30 | XLNetLMHeadModel, 31 | load_tf_weights_in_xlnet, 32 | ) 33 | 34 | 35 | GLUE_TASKS_NUM_LABELS = { 36 | "cola": 2, 37 | "mnli": 3, 38 | "mrpc": 2, 39 | "sst-2": 2, 40 | "sts-b": 1, 41 | "qqp": 2, 42 | "qnli": 2, 43 | "rte": 2, 44 | "wnli": 2, 45 | } 46 | 47 | 48 | logging.basicConfig(level=logging.INFO) 49 | 50 | 51 | def convert_xlnet_checkpoint_to_pytorch( 52 | tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None 53 | ): 54 | # Initialise PyTorch model 55 | config = XLNetConfig.from_json_file(bert_config_file) 56 | 57 | finetuning_task = finetuning_task.lower() if finetuning_task is not None else "" 58 | if finetuning_task in GLUE_TASKS_NUM_LABELS: 59 | print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config))) 60 | config.finetuning_task = finetuning_task 61 | config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task] 62 | model = XLNetForSequenceClassification(config) 63 | elif "squad" in finetuning_task: 64 | config.finetuning_task = finetuning_task 65 | model = XLNetForQuestionAnswering(config) 66 | else: 67 | model = XLNetLMHeadModel(config) 68 | 69 | # Load weights from tf checkpoint 70 | load_tf_weights_in_xlnet(model, config, tf_checkpoint_path) 71 | 72 | # Save pytorch-model 73 | pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) 74 | pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) 75 | print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) 76 | torch.save(model.state_dict(), pytorch_weights_dump_path) 77 | print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) 78 | with open(pytorch_config_dump_path, "w", 
encoding="utf-8") as f: 79 | f.write(config.to_json_string()) 80 | 81 | 82 | if __name__ == "__main__": 83 | parser = argparse.ArgumentParser() 84 | # Required parameters 85 | parser.add_argument( 86 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 87 | ) 88 | parser.add_argument( 89 | "--xlnet_config_file", 90 | default=None, 91 | type=str, 92 | required=True, 93 | help="The config json file corresponding to the pre-trained XLNet model. \n" 94 | "This specifies the model architecture.", 95 | ) 96 | parser.add_argument( 97 | "--pytorch_dump_folder_path", 98 | default=None, 99 | type=str, 100 | required=True, 101 | help="Path to the folder to store the PyTorch model or dataset/vocab.", 102 | ) 103 | parser.add_argument( 104 | "--finetuning_task", 105 | default=None, 106 | type=str, 107 | help="Name of a task on which the XLNet TensorFloaw model was fine-tuned", 108 | ) 109 | args = parser.parse_args() 110 | print(args) 111 | 112 | convert_xlnet_checkpoint_to_pytorch( 113 | args.tf_checkpoint_path, args.xlnet_config_file, args.pytorch_dump_folder_path, args.finetuning_task 114 | ) 115 | -------------------------------------------------------------------------------- /transformers/data/datasets/language_modeling.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pickle 4 | import time 5 | 6 | import torch 7 | from filelock import FileLock 8 | from torch.utils.data.dataset import Dataset 9 | 10 | from ...tokenization_utils import PreTrainedTokenizer 11 | 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class TextDataset(Dataset): 17 | """ 18 | This will be superseded by a framework-agnostic approach 19 | soon. 20 | """ 21 | 22 | def __init__( 23 | self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, overwrite_cache=False, 24 | ): 25 | assert os.path.isfile(file_path) 26 | 27 | block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False) 28 | 29 | directory, filename = os.path.split(file_path) 30 | cached_features_file = os.path.join( 31 | directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,), 32 | ) 33 | 34 | # Make sure only the first process in distributed training processes the dataset, 35 | # and the others will use the cache. 
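        # (Sketch of the intent: the first process to acquire the ".lock" file below builds the features and
        # pickles them into `cached_features_file`; every other process blocks on the lock and then simply
        # loads that pickle, so the expensive tokenization pass runs only once per machine.)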
36 | lock_path = cached_features_file + ".lock" 37 | with FileLock(lock_path): 38 | 39 | if os.path.exists(cached_features_file) and not overwrite_cache: 40 | start = time.time() 41 | with open(cached_features_file, "rb") as handle: 42 | self.examples = pickle.load(handle) 43 | logger.info( 44 | f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start 45 | ) 46 | 47 | else: 48 | logger.info(f"Creating features from dataset file at {directory}") 49 | 50 | self.examples = [] 51 | with open(file_path, encoding="utf-8") as f: 52 | text = f.read() 53 | 54 | tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) 55 | 56 | for i in range(0, len(tokenized_text) - block_size + 1, block_size): # Truncate in block of block_size 57 | self.examples.append( 58 | tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size]) 59 | ) 60 | # Note that we are losing the last truncated example here for the sake of simplicity (no padding) 61 | # If your dataset is small, first you should loook for a bigger one :-) and second you 62 | # can change this behavior by adding (model specific) padding. 63 | 64 | start = time.time() 65 | with open(cached_features_file, "wb") as handle: 66 | pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL) 67 | logger.info( 68 | "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start 69 | ) 70 | 71 | def __len__(self): 72 | return len(self.examples) 73 | 74 | def __getitem__(self, i) -> torch.Tensor: 75 | return torch.tensor(self.examples[i], dtype=torch.long) 76 | 77 | 78 | class LineByLineTextDataset(Dataset): 79 | """ 80 | This will be superseded by a framework-agnostic approach 81 | soon. 82 | """ 83 | 84 | def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, shuffle=False): 85 | assert os.path.isfile(file_path) 86 | # Here, we do not cache the features, operating under the assumption 87 | # that we will soon use fast multithreaded tokenizers from the 88 | # `tokenizers` repo everywhere =) 89 | logger.info("Creating features from dataset file at %s", file_path) 90 | 91 | with open(file_path, encoding="utf-8") as f: 92 | lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())] 93 | #if shuffle: 94 | # import random 95 | # random.shuffle(lines) 96 | self.shuffle = shuffle 97 | 98 | batch_encoding = tokenizer(lines, add_special_tokens=True, truncation=True, max_length=block_size) 99 | self.examples = batch_encoding["input_ids"] 100 | 101 | def __len__(self): 102 | return len(self.examples) 103 | 104 | def __getitem__(self, i) -> torch.Tensor: 105 | if self.shuffle and i % len(self.examples) == 0: 106 | import random 107 | random.shuffle(self.examples) 108 | 109 | return torch.tensor(self.examples[i], dtype=torch.long) 110 | -------------------------------------------------------------------------------- /transformers/convert_bert_pytorch_checkpoint_to_original_tf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" 17 | 18 | import argparse 19 | import os 20 | 21 | import numpy as np 22 | import tensorflow as tf 23 | import torch 24 | 25 | from transformers import BertModel 26 | 27 | 28 | def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str): 29 | 30 | """ 31 | :param model:BertModel Pytorch model instance to be converted 32 | :param ckpt_dir: Tensorflow model directory 33 | :param model_name: model name 34 | :return: 35 | 36 | Currently supported HF models: 37 | Y BertModel 38 | N BertForMaskedLM 39 | N BertForPreTraining 40 | N BertForMultipleChoice 41 | N BertForNextSentencePrediction 42 | N BertForSequenceClassification 43 | N BertForQuestionAnswering 44 | """ 45 | 46 | tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value") 47 | 48 | var_map = ( 49 | ("layer.", "layer_"), 50 | ("word_embeddings.weight", "word_embeddings"), 51 | ("position_embeddings.weight", "position_embeddings"), 52 | ("token_type_embeddings.weight", "token_type_embeddings"), 53 | (".", "/"), 54 | ("LayerNorm/weight", "LayerNorm/gamma"), 55 | ("LayerNorm/bias", "LayerNorm/beta"), 56 | ("weight", "kernel"), 57 | ) 58 | 59 | if not os.path.isdir(ckpt_dir): 60 | os.makedirs(ckpt_dir) 61 | 62 | state_dict = model.state_dict() 63 | 64 | def to_tf_var_name(name: str): 65 | for patt, repl in iter(var_map): 66 | name = name.replace(patt, repl) 67 | return "bert/{}".format(name) 68 | 69 | def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session): 70 | tf_dtype = tf.dtypes.as_dtype(tensor.dtype) 71 | tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) 72 | session.run(tf.variables_initializer([tf_var])) 73 | session.run(tf_var) 74 | return tf_var 75 | 76 | tf.reset_default_graph() 77 | with tf.Session() as session: 78 | for var_name in state_dict: 79 | tf_name = to_tf_var_name(var_name) 80 | torch_tensor = state_dict[var_name].numpy() 81 | if any([x in var_name for x in tensors_to_transpose]): 82 | torch_tensor = torch_tensor.T 83 | tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) 84 | tf.keras.backend.set_value(tf_var, torch_tensor) 85 | tf_weight = session.run(tf_var) 86 | print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor))) 87 | 88 | saver = tf.train.Saver(tf.trainable_variables()) 89 | saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) 90 | 91 | 92 | def main(raw_args=None): 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument("--model_name", type=str, required=True, help="model name e.g. 
bert-base-uncased") 95 | parser.add_argument( 96 | "--cache_dir", type=str, default=None, required=False, help="Directory containing pytorch model" 97 | ) 98 | parser.add_argument("--pytorch_model_path", type=str, required=True, help="/path/to/.bin") 99 | parser.add_argument("--tf_cache_dir", type=str, required=True, help="Directory in which to save tensorflow model") 100 | args = parser.parse_args(raw_args) 101 | 102 | model = BertModel.from_pretrained( 103 | pretrained_model_name_or_path=args.model_name, 104 | state_dict=torch.load(args.pytorch_model_path), 105 | cache_dir=args.cache_dir, 106 | ) 107 | 108 | convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_cache_dir, model_name=args.model_name) 109 | 110 | 111 | if __name__ == "__main__": 112 | main() 113 | -------------------------------------------------------------------------------- /transformers/configuration_encoder_decoder.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import copy 18 | import logging 19 | 20 | from .configuration_utils import PretrainedConfig 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | class EncoderDecoderConfig(PretrainedConfig): 27 | r""" 28 | :class:`~transformers.EncoderDecoderConfig` is the configuration class to store the configuration of a `EncoderDecoderModel`. 29 | 30 | It is used to instantiate an Encoder Decoder model according to the specified arguments, defining the encoder and decoder configs. 31 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` 32 | and can be used to control the model outputs. 33 | See the documentation for :class:`~transformers.PretrainedConfig` for more information. 34 | 35 | Args: 36 | kwargs (`optional`): 37 | Remaining dictionary of keyword arguments. Notably: 38 | encoder (:class:`PretrainedConfig`, optional, defaults to `None`): 39 | An instance of a configuration object that defines the encoder config. 40 | decoder (:class:`PretrainedConfig`, optional, defaults to `None`): 41 | An instance of a configuration object that defines the decoder config. 
42 | 43 | Example:: 44 | 45 | >>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel 46 | 47 | >>> # Initializing a BERT bert-base-uncased style configuration 48 | >>> config_encoder = BertConfig() 49 | >>> config_decoder = BertConfig() 50 | 51 | >>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder) 52 | 53 | >>> # Initializing a Bert2Bert model from the bert-base-uncased style configurations 54 | >>> model = EncoderDecoderModel(config=config) 55 | 56 | >>> # Accessing the model configuration 57 | >>> config_encoder = model.config.encoder 58 | >>> config_decoder = model.config.decoder 59 | """ 60 | model_type = "encoder_decoder" 61 | 62 | def __init__(self, **kwargs): 63 | super().__init__(**kwargs) 64 | assert ( 65 | "encoder" in kwargs and "decoder" in kwargs 66 | ), "Config has to be initialized with encoder and decoder config" 67 | encoder_config = kwargs.pop("encoder") 68 | encoder_model_type = encoder_config.pop("model_type") 69 | decoder_config = kwargs.pop("decoder") 70 | decoder_model_type = decoder_config.pop("model_type") 71 | 72 | from transformers import AutoConfig 73 | 74 | self.encoder = AutoConfig.for_model(encoder_model_type, **encoder_config) 75 | self.decoder = AutoConfig.for_model(decoder_model_type, **decoder_config) 76 | self.is_encoder_decoder = True 77 | 78 | @classmethod 79 | def from_encoder_decoder_configs( 80 | cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig 81 | ) -> PretrainedConfig: 82 | r""" 83 | Instantiate a :class:`~transformers.EncoderDecoderConfig` (or a derived class) from a pre-trained encoder model configuration and decoder model configuration. 84 | 85 | Returns: 86 | :class:`EncoderDecoderConfig`: An instance of a configuration object 87 | """ 88 | logger.info("Set `config.is_decoder=True` for decoder_config") 89 | decoder_config.is_decoder = True 90 | 91 | return cls(encoder=encoder_config.to_dict(), decoder=decoder_config.to_dict()) 92 | 93 | def to_dict(self): 94 | """ 95 | Serializes this instance to a Python dictionary. Override the default `to_dict()` from `PretrainedConfig`. 96 | 97 | Returns: 98 | :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, 99 | """ 100 | output = copy.deepcopy(self.__dict__) 101 | output["encoder"] = self.encoder.to_dict() 102 | output["decoder"] = self.decoder.to_dict() 103 | output["model_type"] = self.__class__.model_type 104 | return output 105 | -------------------------------------------------------------------------------- /transformers/configuration_t5.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2010, The T5 Authors and HuggingFace Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """ T5 model configuration """ 16 | 17 | 18 | import logging 19 | 20 | from .configuration_utils import PretrainedConfig 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | T5_PRETRAINED_CONFIG_ARCHIVE_MAP = { 26 | "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json", 27 | "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json", 28 | "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json", 29 | "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json", 30 | "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json", 31 | } 32 | 33 | 34 | class T5Config(PretrainedConfig): 35 | r""" 36 | :class:`~transformers.T5Config` is the configuration class to store the configuration of a 37 | `T5Model`. 38 | 39 | 40 | Arguments: 41 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `T5Model`. 42 | d_model: Size of the encoder layers and the pooler layer. `d_model` can also be accessed via the property `hidden_size`. 43 | num_layers: Number of hidden layers in the Transformer encoder. `num_layers` can also be accessed via the property `num_hidden_layers`. 44 | d_kv: Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model // num_heads`. 45 | d_ff: Size of the intermediate feed forward layer in each `T5Block`. 46 | num_heads: Number of attention heads for each attention layer in 47 | the Transformer encoder. `num_heads` can also be accessed via the property `num_attention_heads`. 48 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 49 | layer in the Transformer encoder. 50 | hidden_act: The non-linear activation function (function or string) in the 51 | encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. 52 | hidden_dropout_prob: The dropout probability for all fully connected 53 | layers in the embeddings, encoder, and pooler. 54 | attention_probs_dropout_prob: The dropout ratio for the attention 55 | probabilities. 56 | n_positions: The maximum sequence length that this model might 57 | ever be used with. Typically set this to something large just in case 58 | (e.g., 512 or 1024 or 2048). `n_positions` can also be accessed via the property `max_position_embeddings`. 59 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 60 | `T5Model`. 61 | initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing). 62 | layer_norm_eps: The epsilon used by LayerNorm.
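
    For illustration, a minimal usage sketch in the same style as the other configuration classes in this
    package (the model built this way is randomly initialized, not pretrained)::

        >>> from transformers import T5Config, T5Model

        >>> # Initializing a configuration with the defaults documented above
        >>> configuration = T5Config()

        >>> # Initializing a model from that configuration
        >>> model = T5Model(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config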
63 | """ 64 | model_type = "t5" 65 | 66 | def __init__( 67 | self, 68 | vocab_size=32128, 69 | n_positions=512, 70 | d_model=512, 71 | d_kv=64, 72 | d_ff=2048, 73 | num_layers=6, 74 | num_heads=8, 75 | relative_attention_num_buckets=32, 76 | dropout_rate=0.1, 77 | layer_norm_epsilon=1e-6, 78 | initializer_factor=1.0, 79 | is_encoder_decoder=True, 80 | pad_token_id=0, 81 | eos_token_id=1, 82 | **kwargs 83 | ): 84 | super().__init__( 85 | pad_token_id=pad_token_id, eos_token_id=eos_token_id, is_encoder_decoder=is_encoder_decoder, **kwargs, 86 | ) 87 | self.vocab_size = vocab_size 88 | self.n_positions = n_positions 89 | self.d_model = d_model 90 | self.d_kv = d_kv 91 | self.d_ff = d_ff 92 | self.num_layers = num_layers 93 | self.num_heads = num_heads 94 | self.relative_attention_num_buckets = relative_attention_num_buckets 95 | self.dropout_rate = dropout_rate 96 | self.layer_norm_epsilon = layer_norm_epsilon 97 | self.initializer_factor = initializer_factor 98 | 99 | @property 100 | def max_position_embeddings(self): 101 | return self.n_positions 102 | 103 | @property 104 | def hidden_size(self): 105 | return self.d_model 106 | 107 | @property 108 | def num_attention_heads(self): 109 | return self.num_heads 110 | 111 | @property 112 | def num_hidden_layers(self): 113 | return self.num_layers 114 | -------------------------------------------------------------------------------- /transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert Transformer XL checkpoint and datasets.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | import os 21 | import pickle 22 | import sys 23 | 24 | import torch 25 | 26 | import transformers.tokenization_transfo_xl as data_utils 27 | from transformers import ( 28 | CONFIG_NAME, 29 | WEIGHTS_NAME, 30 | TransfoXLConfig, 31 | TransfoXLLMHeadModel, 32 | load_tf_weights_in_transfo_xl, 33 | ) 34 | from transformers.tokenization_transfo_xl import CORPUS_NAME, VOCAB_FILES_NAMES 35 | 36 | 37 | logging.basicConfig(level=logging.INFO) 38 | 39 | # We do this to be able to load python 2 datasets pickles 40 | # See e.g. 
https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918 41 | data_utils.Vocab = data_utils.TransfoXLTokenizer 42 | data_utils.Corpus = data_utils.TransfoXLCorpus 43 | sys.modules["data_utils"] = data_utils 44 | sys.modules["vocabulary"] = data_utils 45 | 46 | 47 | def convert_transfo_xl_checkpoint_to_pytorch( 48 | tf_checkpoint_path, transfo_xl_config_file, pytorch_dump_folder_path, transfo_xl_dataset_file 49 | ): 50 | if transfo_xl_dataset_file: 51 | # Convert a pre-processed corpus (see original TensorFlow repo) 52 | with open(transfo_xl_dataset_file, "rb") as fp: 53 | corpus = pickle.load(fp, encoding="latin1") 54 | # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term) 55 | pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["pretrained_vocab_file"] 56 | print("Save vocabulary to {}".format(pytorch_vocab_dump_path)) 57 | corpus_vocab_dict = corpus.vocab.__dict__ 58 | torch.save(corpus_vocab_dict, pytorch_vocab_dump_path) 59 | 60 | corpus_dict_no_vocab = corpus.__dict__ 61 | corpus_dict_no_vocab.pop("vocab", None) 62 | pytorch_dataset_dump_path = pytorch_dump_folder_path + "/" + CORPUS_NAME 63 | print("Save dataset to {}".format(pytorch_dataset_dump_path)) 64 | torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path) 65 | 66 | if tf_checkpoint_path: 67 | # Convert a pre-trained TensorFlow model 68 | config_path = os.path.abspath(transfo_xl_config_file) 69 | tf_path = os.path.abspath(tf_checkpoint_path) 70 | 71 | print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path)) 72 | # Initialise PyTorch model 73 | if transfo_xl_config_file == "": 74 | config = TransfoXLConfig() 75 | else: 76 | config = TransfoXLConfig.from_json_file(transfo_xl_config_file) 77 | print("Building PyTorch model from configuration: {}".format(str(config))) 78 | model = TransfoXLLMHeadModel(config) 79 | 80 | model = load_tf_weights_in_transfo_xl(model, config, tf_path) 81 | # Save pytorch-model 82 | pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) 83 | pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) 84 | print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) 85 | torch.save(model.state_dict(), pytorch_weights_dump_path) 86 | print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) 87 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 88 | f.write(config.to_json_string()) 89 | 90 | 91 | if __name__ == "__main__": 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument( 94 | "--pytorch_dump_folder_path", 95 | default=None, 96 | type=str, 97 | required=True, 98 | help="Path to the folder to store the PyTorch model or dataset/vocab.", 99 | ) 100 | parser.add_argument( 101 | "--tf_checkpoint_path", 102 | default="", 103 | type=str, 104 | help="An optional path to a TensorFlow checkpoint path to be converted.", 105 | ) 106 | parser.add_argument( 107 | "--transfo_xl_config_file", 108 | default="", 109 | type=str, 110 | help="An optional config json file corresponding to the pre-trained BERT model. 
\n" 111 | "This specifies the model architecture.", 112 | ) 113 | parser.add_argument( 114 | "--transfo_xl_dataset_file", 115 | default="", 116 | type=str, 117 | help="An optional dataset file to be converted in a vocabulary.", 118 | ) 119 | args = parser.parse_args() 120 | convert_transfo_xl_checkpoint_to_pytorch( 121 | args.tf_checkpoint_path, 122 | args.transfo_xl_config_file, 123 | args.pytorch_dump_folder_path, 124 | args.transfo_xl_dataset_file, 125 | ) 126 | -------------------------------------------------------------------------------- /transformers/modeling_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 Inria, Facebook AI Research and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """PyTorch CamemBERT model. """ 17 | 18 | import logging 19 | 20 | from .configuration_camembert import CamembertConfig 21 | from .file_utils import add_start_docstrings 22 | from .modeling_roberta import ( 23 | RobertaForMaskedLM, 24 | RobertaForMultipleChoice, 25 | RobertaForQuestionAnswering, 26 | RobertaForSequenceClassification, 27 | RobertaForTokenClassification, 28 | RobertaModel, 29 | ) 30 | 31 | 32 | logger = logging.getLogger(__name__) 33 | 34 | _TOKENIZER_FOR_DOC = "CamembertTokenizer" 35 | 36 | CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ 37 | "camembert-base", 38 | "Musixmatch/umberto-commoncrawl-cased-v1", 39 | "Musixmatch/umberto-wikipedia-uncased-v1", 40 | # See all CamemBERT models at https://huggingface.co/models?filter=camembert 41 | ] 42 | 43 | CAMEMBERT_START_DOCSTRING = r""" 44 | 45 | This model is a PyTorch `torch.nn.Module `_ sub-class. 46 | Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general 47 | usage and behavior. 48 | 49 | Parameters: 50 | config (:class:`~transformers.CamembertConfig`): Model configuration class with all the parameters of the 51 | model. Initializing with a config file does not load the weights associated with the model, only the 52 | configuration. 53 | Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. 54 | output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): 55 | If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. 56 | """ 57 | 58 | 59 | @add_start_docstrings( 60 | "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.", 61 | CAMEMBERT_START_DOCSTRING, 62 | ) 63 | class CamembertModel(RobertaModel): 64 | """ 65 | This class overrides :class:`~transformers.RobertaModel`. Please check the 66 | superclass for the appropriate documentation alongside usage examples. 
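
    A brief loading sketch, for illustration (``"camembert-base"`` is one of the checkpoints listed in
    ``CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST`` above)::

        >>> from transformers import CamembertModel, CamembertTokenizer

        >>> tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
        >>> model = CamembertModel.from_pretrained("camembert-base")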
67 | """ 68 | 69 | config_class = CamembertConfig 70 | 71 | 72 | @add_start_docstrings( 73 | """CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING, 74 | ) 75 | class CamembertForMaskedLM(RobertaForMaskedLM): 76 | """ 77 | This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the 78 | superclass for the appropriate documentation alongside usage examples. 79 | """ 80 | 81 | config_class = CamembertConfig 82 | 83 | 84 | @add_start_docstrings( 85 | """CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer 86 | on top of the pooled output) e.g. for GLUE tasks. """, 87 | CAMEMBERT_START_DOCSTRING, 88 | ) 89 | class CamembertForSequenceClassification(RobertaForSequenceClassification): 90 | """ 91 | This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the 92 | superclass for the appropriate documentation alongside usage examples. 93 | """ 94 | 95 | config_class = CamembertConfig 96 | 97 | 98 | @add_start_docstrings( 99 | """CamemBERT Model with a multiple choice classification head on top (a linear layer on top of 100 | the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, 101 | CAMEMBERT_START_DOCSTRING, 102 | ) 103 | class CamembertForMultipleChoice(RobertaForMultipleChoice): 104 | """ 105 | This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the 106 | superclass for the appropriate documentation alongside usage examples. 107 | """ 108 | 109 | config_class = CamembertConfig 110 | 111 | 112 | @add_start_docstrings( 113 | """CamemBERT Model with a token classification head on top (a linear layer on top of 114 | the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, 115 | CAMEMBERT_START_DOCSTRING, 116 | ) 117 | class CamembertForTokenClassification(RobertaForTokenClassification): 118 | """ 119 | This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the 120 | superclass for the appropriate documentation alongside usage examples. 121 | """ 122 | 123 | config_class = CamembertConfig 124 | 125 | 126 | @add_start_docstrings( 127 | """CamemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD 128 | (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits` """, 129 | CAMEMBERT_START_DOCSTRING, 130 | ) 131 | class CamembertForQuestionAnswering(RobertaForQuestionAnswering): 132 | """ 133 | This class overrides :class:`~transformers.RobertaForQuestionAnswering`. Please check the 134 | superclass for the appropriate documentation alongside usage examples. 135 | """ 136 | 137 | config_class = CamembertConfig 138 | -------------------------------------------------------------------------------- /transformers/configuration_retribert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ RetriBERT model configuration """ 16 | 17 | 18 | import logging 19 | 20 | from .configuration_utils import PretrainedConfig 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | # TODO: upload to AWS 26 | RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "retribert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", 28 | } 29 | 30 | 31 | class RetriBertConfig(PretrainedConfig): 32 | r""" 33 | This is the configuration class to store the configuration of a :class:`~transformers.RetriBertModel`. 34 | It is used to instantiate a RetriBertModel according to the specified arguments, defining the model 35 | architecture. 36 | 37 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used 38 | to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` 39 | for more information. 40 | 41 | 42 | Args: 43 | vocab_size (:obj:`int`, optional, defaults to 30522): 44 | Vocabulary size of the BERT model. Defines the different tokens that 45 | can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`. 46 | hidden_size (:obj:`int`, optional, defaults to 768): 47 | Dimensionality of the encoder layers and the pooler layer. 48 | num_hidden_layers (:obj:`int`, optional, defaults to 8): 49 | Number of hidden layers in the Transformer encoder. 50 | num_attention_heads (:obj:`int`, optional, defaults to 12): 51 | Number of attention heads for each attention layer in the Transformer encoder. 52 | intermediate_size (:obj:`int`, optional, defaults to 3072): 53 | Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 54 | hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): 55 | The non-linear activation function (function or string) in the encoder and pooler. 56 | If string, "gelu", "relu", "swish" and "gelu_new" are supported. 57 | hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1): 58 | The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. 59 | attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1): 60 | The dropout ratio for the attention probabilities. 61 | max_position_embeddings (:obj:`int`, optional, defaults to 512): 62 | The maximum sequence length that this model might ever be used with. 63 | Typically set this to something large just in case (e.g., 512 or 1024 or 2048). 64 | type_vocab_size (:obj:`int`, optional, defaults to 2): 65 | The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`. 66 | initializer_range (:obj:`float`, optional, defaults to 0.02): 67 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 68 | layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): 69 | The epsilon used by the layer normalization layers.
70 | share_encoders (:obj:`bool`, optional, defaults to True): 71 | Whether to use the same Bert-type encoder for the queries and document 72 | projection_dim (:obj:`int`, optional, defaults to 128): 73 | Final dimension of the query and document representation after projection 74 | 75 | """ 76 | model_type = "retribert" 77 | 78 | def __init__( 79 | self, 80 | vocab_size=30522, 81 | hidden_size=768, 82 | num_hidden_layers=8, 83 | num_attention_heads=12, 84 | intermediate_size=3072, 85 | hidden_act="gelu", 86 | hidden_dropout_prob=0.1, 87 | attention_probs_dropout_prob=0.1, 88 | max_position_embeddings=512, 89 | type_vocab_size=2, 90 | initializer_range=0.02, 91 | layer_norm_eps=1e-12, 92 | share_encoders=True, 93 | projection_dim=128, 94 | pad_token_id=0, 95 | **kwargs 96 | ): 97 | super().__init__(pad_token_id=pad_token_id, **kwargs) 98 | 99 | self.vocab_size = vocab_size 100 | self.hidden_size = hidden_size 101 | self.num_hidden_layers = num_hidden_layers 102 | self.num_attention_heads = num_attention_heads 103 | self.hidden_act = hidden_act 104 | self.intermediate_size = intermediate_size 105 | self.hidden_dropout_prob = hidden_dropout_prob 106 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 107 | self.max_position_embeddings = max_position_embeddings 108 | self.type_vocab_size = type_vocab_size 109 | self.initializer_range = initializer_range 110 | self.layer_norm_eps = layer_norm_eps 111 | self.share_encoders = share_encoders 112 | self.projection_dim = projection_dim 113 | -------------------------------------------------------------------------------- /transformers/modeling_xlm_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """PyTorch XLM-RoBERTa model. """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_xlm_roberta import XLMRobertaConfig 22 | from .file_utils import add_start_docstrings 23 | from .modeling_roberta import ( 24 | RobertaForMaskedLM, 25 | RobertaForMultipleChoice, 26 | RobertaForQuestionAnswering, 27 | RobertaForSequenceClassification, 28 | RobertaForTokenClassification, 29 | RobertaModel, 30 | ) 31 | 32 | 33 | logger = logging.getLogger(__name__) 34 | 35 | XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ 36 | "xlm-roberta-base", 37 | "xlm-roberta-large", 38 | "xlm-roberta-large-finetuned-conll02-dutch", 39 | "xlm-roberta-large-finetuned-conll02-spanish", 40 | "xlm-roberta-large-finetuned-conll03-english", 41 | "xlm-roberta-large-finetuned-conll03-german", 42 | # See all XLM-RoBERTa models at https://huggingface.co/models?filter=xlm-roberta 43 | ] 44 | 45 | 46 | XLM_ROBERTA_START_DOCSTRING = r""" 47 | 48 | This model is a PyTorch `torch.nn.Module `_ sub-class. 
49 | Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general 50 | usage and behavior. 51 | 52 | Parameters: 53 | config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the 54 | model. Initializing with a config file does not load the weights associated with the model, only the configuration. 55 | Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. 56 | output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): 57 | If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. 58 | """ 59 | 60 | 61 | @add_start_docstrings( 62 | "The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", 63 | XLM_ROBERTA_START_DOCSTRING, 64 | ) 65 | class XLMRobertaModel(RobertaModel): 66 | """ 67 | This class overrides :class:`~transformers.RobertaModel`. Please check the 68 | superclass for the appropriate documentation alongside usage examples. 69 | """ 70 | 71 | config_class = XLMRobertaConfig 72 | 73 | 74 | @add_start_docstrings( 75 | """XLM-RoBERTa Model with a `language modeling` head on top. """, XLM_ROBERTA_START_DOCSTRING, 76 | ) 77 | class XLMRobertaForMaskedLM(RobertaForMaskedLM): 78 | """ 79 | This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the 80 | superclass for the appropriate documentation alongside usage examples. 81 | """ 82 | 83 | config_class = XLMRobertaConfig 84 | 85 | 86 | @add_start_docstrings( 87 | """XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer 88 | on top of the pooled output) e.g. for GLUE tasks. """, 89 | XLM_ROBERTA_START_DOCSTRING, 90 | ) 91 | class XLMRobertaForSequenceClassification(RobertaForSequenceClassification): 92 | """ 93 | This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the 94 | superclass for the appropriate documentation alongside usage examples. 95 | """ 96 | 97 | config_class = XLMRobertaConfig 98 | 99 | 100 | @add_start_docstrings( 101 | """XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of 102 | the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, 103 | XLM_ROBERTA_START_DOCSTRING, 104 | ) 105 | class XLMRobertaForMultipleChoice(RobertaForMultipleChoice): 106 | """ 107 | This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the 108 | superclass for the appropriate documentation alongside usage examples. 109 | """ 110 | 111 | config_class = XLMRobertaConfig 112 | 113 | 114 | @add_start_docstrings( 115 | """XLM-RoBERTa Model with a token classification head on top (a linear layer on top of 116 | the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, 117 | XLM_ROBERTA_START_DOCSTRING, 118 | ) 119 | class XLMRobertaForTokenClassification(RobertaForTokenClassification): 120 | """ 121 | This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the 122 | superclass for the appropriate documentation alongside usage examples. 
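
    A short usage sketch, for illustration (loading ``"xlm-roberta-base"`` this way leaves the
    token-classification head randomly initialized, so it still needs fine-tuning)::

        >>> from transformers import XLMRobertaTokenizer, XLMRobertaForTokenClassification

        >>> tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
        >>> model = XLMRobertaForTokenClassification.from_pretrained("xlm-roberta-base")
        >>> inputs = tokenizer("Hello world", return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs[0]  # per-token classification scores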
123 | """ 124 | 125 | config_class = XLMRobertaConfig 126 | 127 | 128 | @add_start_docstrings( 129 | """XLM-RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a 130 | linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).""", 131 | XLM_ROBERTA_START_DOCSTRING, 132 | ) 133 | class XLMRobertaForQuestionAnswering(RobertaForQuestionAnswering): 134 | """ 135 | This class overrides :class:`~transformers.RobertaForQuestionAnswering`. Please check the 136 | superclass for the appropriate documentation alongside usage examples. 137 | """ 138 | 139 | config_class = XLMRobertaConfig 140 | -------------------------------------------------------------------------------- /transformers/data/datasets/glue.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | from dataclasses import dataclass, field 5 | from enum import Enum 6 | from typing import List, Optional, Union 7 | 8 | import torch 9 | from filelock import FileLock 10 | from torch.utils.data.dataset import Dataset 11 | 12 | from ...tokenization_bart import BartTokenizer, BartTokenizerFast 13 | from ...tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast 14 | from ...tokenization_utils import PreTrainedTokenizer 15 | from ...tokenization_xlm_roberta import XLMRobertaTokenizer 16 | from ..processors.glue import glue_convert_examples_to_features, glue_output_modes, glue_processors 17 | from ..processors.utils import InputFeatures 18 | 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | @dataclass 24 | class GlueDataTrainingArguments: 25 | """ 26 | Arguments pertaining to what data we are going to input our model for training and eval. 27 | 28 | Using `HfArgumentParser` we can turn this class 29 | into argparse arguments to be able to specify them on 30 | the command line. 31 | """ 32 | 33 | task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())}) 34 | data_dir: str = field( 35 | metadata={"help": "The input data dir. Should contain the .tsv files (or other data files) for the task."} 36 | ) 37 | max_seq_length: int = field( 38 | default=128, 39 | metadata={ 40 | "help": "The maximum total input sequence length after tokenization. Sequences longer " 41 | "than this will be truncated, sequences shorter will be padded." 42 | }, 43 | ) 44 | overwrite_cache: bool = field( 45 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 46 | ) 47 | 48 | def __post_init__(self): 49 | self.task_name = self.task_name.lower() 50 | 51 | 52 | class Split(Enum): 53 | train = "train" 54 | dev = "dev" 55 | test = "test" 56 | 57 | 58 | class GlueDataset(Dataset): 59 | """ 60 | This will be superseded by a framework-agnostic approach 61 | soon. 
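
    A minimal construction sketch, for illustration (the ``data_dir`` path below is a placeholder and must
    point at the GLUE ``.tsv`` files for the chosen task)::

        >>> from transformers import BertTokenizer, GlueDataset, GlueDataTrainingArguments

        >>> args = GlueDataTrainingArguments(task_name="mrpc", data_dir="/path/to/glue/MRPC")
        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        >>> train_dataset = GlueDataset(args, tokenizer=tokenizer)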
62 | """ 63 | 64 | args: GlueDataTrainingArguments 65 | output_mode: str 66 | features: List[InputFeatures] 67 | 68 | def __init__( 69 | self, 70 | args: GlueDataTrainingArguments, 71 | tokenizer: PreTrainedTokenizer, 72 | limit_length: Optional[int] = None, 73 | mode: Union[str, Split] = Split.train, 74 | cache_dir: Optional[str] = None, 75 | ): 76 | self.args = args 77 | self.processor = glue_processors[args.task_name]() 78 | self.output_mode = glue_output_modes[args.task_name] 79 | if isinstance(mode, str): 80 | try: 81 | mode = Split[mode] 82 | except KeyError: 83 | raise KeyError("mode is not a valid split name") 84 | # Load data features from cache or dataset file 85 | cached_features_file = os.path.join( 86 | cache_dir if cache_dir is not None else args.data_dir, 87 | "cached_{}_{}_{}_{}".format( 88 | mode.value, tokenizer.__class__.__name__, str(args.max_seq_length), args.task_name, 89 | ), 90 | ) 91 | label_list = self.processor.get_labels() 92 | if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__ in ( 93 | RobertaTokenizer, 94 | RobertaTokenizerFast, 95 | XLMRobertaTokenizer, 96 | BartTokenizer, 97 | BartTokenizerFast, 98 | ): 99 | # HACK(label indices are swapped in RoBERTa pretrained model) 100 | label_list[1], label_list[2] = label_list[2], label_list[1] 101 | self.label_list = label_list 102 | 103 | # Make sure only the first process in distributed training processes the dataset, 104 | # and the others will use the cache. 105 | lock_path = cached_features_file + ".lock" 106 | with FileLock(lock_path): 107 | 108 | if os.path.exists(cached_features_file) and not args.overwrite_cache: 109 | start = time.time() 110 | self.features = torch.load(cached_features_file) 111 | logger.info( 112 | f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start 113 | ) 114 | else: 115 | logger.info(f"Creating features from dataset file at {args.data_dir}") 116 | 117 | if mode == Split.dev: 118 | examples = self.processor.get_dev_examples(args.data_dir) 119 | elif mode == Split.test: 120 | examples = self.processor.get_test_examples(args.data_dir) 121 | else: 122 | examples = self.processor.get_train_examples(args.data_dir) 123 | if limit_length is not None: 124 | examples = examples[:limit_length] 125 | self.features = glue_convert_examples_to_features( 126 | examples, 127 | tokenizer, 128 | max_length=args.max_seq_length, 129 | label_list=label_list, 130 | output_mode=self.output_mode, 131 | ) 132 | start = time.time() 133 | torch.save(self.features, cached_features_file) 134 | # ^ This seems to take a lot of time so I want to investigate why and how we can improve. 135 | logger.info( 136 | "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start 137 | ) 138 | 139 | def __len__(self): 140 | return len(self.features) 141 | 142 | def __getitem__(self, i) -> InputFeatures: 143 | return self.features[i] 144 | 145 | def get_labels(self): 146 | return self.label_list 147 | -------------------------------------------------------------------------------- /transformers/benchmark/benchmark_args_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import dataclasses 18 | import json 19 | import logging 20 | from dataclasses import dataclass, field 21 | from time import time 22 | from typing import List 23 | 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | def list_field(default=None, metadata=None): 29 | return field(default_factory=lambda: default, metadata=metadata) 30 | 31 | 32 | @dataclass 33 | class BenchmarkArguments: 34 | """ 35 | BenchmarkArguments are arguments we use in our benchmark scripts 36 | **which relate to the training loop itself**. 37 | 38 | Using `HfArgumentParser` we can turn this class 39 | into argparse arguments to be able to specify them on 40 | the command line. 41 | """ 42 | 43 | models: List[str] = list_field( 44 | default=[], 45 | metadata={ 46 | "help": "Model checkpoints to be provided to the AutoModel classes. Leave blank to benchmark the base version of all available models" 47 | }, 48 | ) 49 | 50 | batch_sizes: List[int] = list_field( 51 | default=[8], metadata={"help": "List of batch sizes for which memory and time performance will be evaluated"} 52 | ) 53 | 54 | sequence_lengths: List[int] = list_field( 55 | default=[8, 32, 128, 512], 56 | metadata={"help": "List of sequence lengths for which memory and time performance will be evaluated"}, 57 | ) 58 | 59 | no_inference: bool = field(default=False, metadata={"help": "Don't benchmark inference of model"}) 60 | no_cuda: bool = field(default=False, metadata={"help": "Whether to run on available cuda devices"}) 61 | no_tpu: bool = field(default=False, metadata={"help": "Whether to run on available tpu devices"}) 62 | fp16: bool = field(default=False, metadata={"help": "Use FP16 to accelerate inference."}) 63 | training: bool = field(default=False, metadata={"help": "Benchmark training of model"}) 64 | verbose: bool = field(default=False, metadata={"help": "Verbose memory tracing"}) 65 | no_speed: bool = field(default=False, metadata={"help": "Don't perform speed measurements"}) 66 | no_memory: bool = field(default=False, metadata={"help": "Don't perform memory measurements"}) 67 | trace_memory_line_by_line: bool = field(default=False, metadata={"help": "Trace memory line by line"}) 68 | save_to_csv: bool = field(default=False, metadata={"help": "Save result to a CSV file"}) 69 | log_print: bool = field(default=False, metadata={"help": "Save all print statements in a log file"}) 70 | no_env_print: bool = field(default=False, metadata={"help": "Don't print environment information"}) 71 | no_multi_process: bool = field( 72 | default=False, 73 | metadata={ 74 | "help": "Don't use multiprocessing for memory and speed measurement. It is highly recommended to use multiprocessing for accurate CPU and GPU memory measurements. This option should only be used for debugging / testing and on TPU."
75 | }, 76 | ) 77 | inference_time_csv_file: str = field( 78 | default=f"inference_time_{round(time())}.csv", 79 | metadata={"help": "CSV filename used if saving time results to csv."}, 80 | ) 81 | inference_memory_csv_file: str = field( 82 | default=f"inference_memory_{round(time())}.csv", 83 | metadata={"help": "CSV filename used if saving memory results to csv."}, 84 | ) 85 | train_time_csv_file: str = field( 86 | default=f"train_time_{round(time())}.csv", 87 | metadata={"help": "CSV filename used if saving time results to csv for training."}, 88 | ) 89 | train_memory_csv_file: str = field( 90 | default=f"train_memory_{round(time())}.csv", 91 | metadata={"help": "CSV filename used if saving memory results to csv for training."}, 92 | ) 93 | env_info_csv_file: str = field( 94 | default=f"env_info_{round(time())}.csv", 95 | metadata={"help": "CSV filename used if saving environment information."}, 96 | ) 97 | log_filename: str = field( 98 | default=f"log_{round(time())}.csv", 99 | metadata={"help": "Log filename used if print statements are saved in log."}, 100 | ) 101 | repeat: int = field(default=3, metadata={"help": "Times an experiment will be run."}) 102 | only_pretrain_model: bool = field( 103 | default=False, 104 | metadata={ 105 | "help": "Instead of loading the model as defined in `config.architectures` if exists, just load the pretrain model weights." 106 | }, 107 | ) 108 | 109 | def to_json_string(self): 110 | """ 111 | Serializes this instance to a JSON string. 112 | """ 113 | return json.dumps(dataclasses.asdict(self), indent=2) 114 | 115 | @property 116 | def model_names(self): 117 | assert ( 118 | len(self.models) > 0 119 | ), "Please make sure you provide at least one model name / model identifier, *e.g.* `--models bert-base-cased` or `args.models = ['bert-base-cased']." 120 | return self.models 121 | 122 | @property 123 | def do_multi_processing(self): 124 | if self.no_multi_process: 125 | return False 126 | elif self.is_tpu: 127 | logger.info("Multiprocessing is currently not possible on TPU.") 128 | return False 129 | else: 130 | return True 131 | -------------------------------------------------------------------------------- /transformers/configuration_ctrl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Salesforce and HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Salesforce CTRL configuration """ 16 | 17 | 18 | import logging 19 | 20 | from .configuration_utils import PretrainedConfig 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://s3.amazonaws.com/models.huggingface.co/bert/ctrl-config.json"} 26 | 27 | 28 | class CTRLConfig(PretrainedConfig): 29 | """ 30 | This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel`. 
31 | It is used to instantiate an CTRL model according to the specified arguments, defining the model 32 | architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of 33 | the `ctrl `__ architecture from SalesForce. 34 | 35 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used 36 | to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` 37 | for more information. 38 | 39 | Args: 40 | vocab_size (:obj:`int`, optional, defaults to 246534): 41 | Vocabulary size of the CTRL model. Defines the different tokens that 42 | can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`. 43 | n_positions (:obj:`int`, optional, defaults to 256): 44 | The maximum sequence length that this model might ever be used with. 45 | Typically set this to something large just in case (e.g., 512 or 1024 or 2048). 46 | n_ctx (:obj:`int`, optional, defaults to 256): 47 | Dimensionality of the causal mask (usually same as n_positions). 48 | n_embd (:obj:`int`, optional, defaults to 1280): 49 | Dimensionality of the embeddings and hidden states. 50 | dff (:obj:`int`, optional, defaults to 8192): 51 | Dimensionality of the inner dimension of the FFN. 52 | n_layer (:obj:`int`, optional, defaults to 48): 53 | Number of hidden layers in the Transformer encoder. 54 | n_head (:obj:`int`, optional, defaults to 16): 55 | Number of attention heads for each attention layer in the Transformer encoder. 56 | resid_pdrop (:obj:`float`, optional, defaults to 0.1): 57 | The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. 58 | embd_pdrop (:obj:`int`, optional, defaults to 0.1): 59 | The dropout ratio for the embeddings. 60 | attn_pdrop (:obj:`float`, optional, defaults to 0.1): 61 | The dropout ratio for the attention. 62 | layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6): 63 | The epsilon to use in the layer normalization layers 64 | initializer_range (:obj:`float`, optional, defaults to 0.02): 65 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
66 | 67 | Example:: 68 | 69 | >>> from transformers import CTRLModel, CTRLConfig 70 | 71 | >>> # Initializing a CTRL configuration 72 | >>> configuration = CTRLConfig() 73 | 74 | >>> # Initializing a model from the configuration 75 | >>> model = CTRLModel(configuration) 76 | 77 | >>> # Accessing the model configuration 78 | >>> configuration = model.config 79 | """ 80 | 81 | model_type = "ctrl" 82 | 83 | def __init__( 84 | self, 85 | vocab_size=246534, 86 | n_positions=256, 87 | n_ctx=256, 88 | n_embd=1280, 89 | dff=8192, 90 | n_layer=48, 91 | n_head=16, 92 | resid_pdrop=0.1, 93 | embd_pdrop=0.1, 94 | attn_pdrop=0.1, 95 | layer_norm_epsilon=1e-6, 96 | initializer_range=0.02, 97 | summary_type="cls_index", 98 | summary_use_proj=True, 99 | summary_activation=None, 100 | summary_proj_to_labels=True, 101 | summary_first_dropout=0.1, 102 | **kwargs 103 | ): 104 | super().__init__(**kwargs) 105 | self.vocab_size = vocab_size 106 | self.n_ctx = n_ctx 107 | self.n_positions = n_positions 108 | self.n_embd = n_embd 109 | self.n_layer = n_layer 110 | self.n_head = n_head 111 | self.dff = dff 112 | self.resid_pdrop = resid_pdrop 113 | self.embd_pdrop = embd_pdrop 114 | self.attn_pdrop = attn_pdrop 115 | self.layer_norm_epsilon = layer_norm_epsilon 116 | self.initializer_range = initializer_range 117 | 118 | self.summary_type = summary_type 119 | self.summary_use_proj = summary_use_proj 120 | self.summary_activation = summary_activation 121 | self.summary_first_dropout = summary_first_dropout 122 | self.summary_proj_to_labels = summary_proj_to_labels 123 | 124 | @property 125 | def max_position_embeddings(self): 126 | return self.n_positions 127 | 128 | @property 129 | def hidden_size(self): 130 | return self.n_embd 131 | 132 | @property 133 | def num_attention_heads(self): 134 | return self.n_head 135 | 136 | @property 137 | def num_hidden_layers(self): 138 | return self.n_layer 139 | -------------------------------------------------------------------------------- /transformers/tokenization_flaubert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for Flaubert, based on XLM.""" 16 | 17 | 18 | import logging 19 | import unicodedata 20 | 21 | import six 22 | 23 | from .tokenization_xlm import XLMTokenizer 24 | 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | VOCAB_FILES_NAMES = { 29 | "vocab_file": "vocab.json", 30 | "merges_file": "merges.txt", 31 | } 32 | 33 | PRETRAINED_VOCAB_FILES_MAP = { 34 | "vocab_file": { 35 | "flaubert/flaubert_small_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/vocab.json", 36 | "flaubert/flaubert_base_uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/vocab.json", 37 | "flaubert/flaubert_base_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/vocab.json", 38 | "flaubert/flaubert_large_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/vocab.json", 39 | }, 40 | "merges_file": { 41 | "flaubert/flaubert_small_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/merges.txt", 42 | "flaubert/flaubert_base_uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/merges.txt", 43 | "flaubert/flaubert_base_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/merges.txt", 44 | "flaubert/flaubert_large_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/merges.txt", 45 | }, 46 | } 47 | 48 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 49 | "flaubert/flaubert_small_cased": 512, 50 | "flaubert/flaubert_base_uncased": 512, 51 | "flaubert/flaubert_base_cased": 512, 52 | "flaubert/flaubert_large_cased": 512, 53 | } 54 | 55 | PRETRAINED_INIT_CONFIGURATION = { 56 | "flaubert/flaubert_small_cased": {"do_lowercase": False}, 57 | "flaubert/flaubert_base_uncased": {"do_lowercase": True}, 58 | "flaubert/flaubert_base_cased": {"do_lowercase": False}, 59 | "flaubert/flaubert_large_cased": {"do_lowercase": False}, 60 | } 61 | 62 | 63 | def convert_to_unicode(text): 64 | """ 65 | Converts `text` to Unicode (if it's not already), assuming UTF-8 input. 66 | """ 67 | # six_ensure_text is copied from https://github.com/benjaminp/six 68 | def six_ensure_text(s, encoding="utf-8", errors="strict"): 69 | if isinstance(s, six.binary_type): 70 | return s.decode(encoding, errors) 71 | elif isinstance(s, six.text_type): 72 | return s 73 | else: 74 | raise TypeError("not expecting type '%s'" % type(s)) 75 | 76 | return six_ensure_text(text, encoding="utf-8", errors="ignore") 77 | 78 | 79 | class FlaubertTokenizer(XLMTokenizer): 80 | """ 81 | BPE tokenizer for Flaubert 82 | 83 | - Moses preprocessing & tokenization 84 | - Normalize all inputs text 85 | - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \ 86 | (ex: "__classify__") to a vocabulary 87 | - `do_lowercase` controle lower casing (automatically set for pretrained vocabularies) 88 | 89 | This tokenizer inherits from :class:`~transformers.XLMTokenizer`. Please check the superclass for usage examples 90 | and documentation regarding arguments. 
91 | """ 92 | 93 | vocab_files_names = VOCAB_FILES_NAMES 94 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 95 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 96 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 97 | 98 | def __init__(self, do_lowercase=False, **kwargs): 99 | super().__init__(**kwargs) 100 | self.do_lowercase = do_lowercase 101 | self.do_lowercase_and_remove_accent = False 102 | 103 | def preprocess_text(self, text): 104 | text = text.replace("``", '"').replace("''", '"') 105 | text = convert_to_unicode(text) 106 | text = unicodedata.normalize("NFC", text) 107 | 108 | if self.do_lowercase: 109 | text = text.lower() 110 | 111 | return text 112 | 113 | def _tokenize(self, text, bypass_tokenizer=False): 114 | """ 115 | Tokenize a string given language code using Moses. 116 | 117 | Details of tokenization: 118 | - [sacremoses](https://github.com/alvations/sacremoses): port of Moses 119 | - Install with `pip install sacremoses` 120 | 121 | Args: 122 | - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) (bool). If True, we only apply BPE. 123 | 124 | Returns: 125 | List of tokens. 126 | """ 127 | lang = "fr" 128 | if lang and self.lang2id and lang not in self.lang2id: 129 | logger.error( 130 | "Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model." 131 | ) 132 | 133 | if bypass_tokenizer: 134 | text = text.split() 135 | else: 136 | text = self.preprocess_text(text) 137 | text = self.moses_pipeline(text, lang=lang) 138 | text = self.moses_tokenize(text, lang=lang) 139 | 140 | split_tokens = [] 141 | for token in text: 142 | if token: 143 | split_tokens.extend([t for t in self.bpe(token).split(" ")]) 144 | 145 | return split_tokens 146 | -------------------------------------------------------------------------------- /transformers/configuration_bart.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Fairseq Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """ BART configuration """ 16 | 17 | 18 | import logging 19 | 20 | from .configuration_utils import PretrainedConfig 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | BART_PRETRAINED_CONFIG_ARCHIVE_MAP = { 26 | "facebook/bart-base": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-base/config.json", 27 | "facebook/bart-large": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large/config.json", 28 | "facebook/bart-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-mnli/config.json", 29 | "facebook/bart-large-cnn": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-cnn/config.json", 30 | "facebook/bart-large-xsum": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-xsum/config.json", 31 | "facebook/mbart-large-en-ro": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/mbart-large-en-ro/config.json", 32 | "yjernite/bart_eli5": "https://s3.amazonaws.com/models.huggingface.co/bert/yjernite/bart_eli5/config.json", 33 | } 34 | 35 | 36 | class BartConfig(PretrainedConfig): 37 | r""" 38 | Configuration class for Bart. Parameters are renamed from the fairseq implementation 39 | """ 40 | model_type = "bart" 41 | 42 | def __init__( 43 | self, 44 | activation_dropout=0.0, 45 | extra_pos_embeddings=2, 46 | activation_function="gelu", 47 | vocab_size=50265, 48 | d_model=1024, 49 | encoder_ffn_dim=4096, 50 | encoder_layers=12, 51 | encoder_attention_heads=16, 52 | decoder_ffn_dim=4096, 53 | decoder_layers=12, 54 | decoder_attention_heads=16, 55 | encoder_layerdrop=0.0, 56 | decoder_layerdrop=0.0, 57 | attention_dropout=0.0, 58 | dropout=0.1, 59 | max_position_embeddings=1024, 60 | init_std=0.02, 61 | classifier_dropout=0.0, 62 | num_labels=3, 63 | is_encoder_decoder=True, 64 | pad_token_id=1, 65 | bos_token_id=0, 66 | eos_token_id=2, 67 | normalize_before=False, 68 | add_final_layer_norm=False, 69 | scale_embedding=False, 70 | normalize_embedding=True, 71 | static_position_embeddings=False, 72 | add_bias_logits=False, 73 | **common_kwargs 74 | ): 75 | r""" 76 | :class:`~transformers.BartConfig` is the configuration class for `BartModel`. 
77 | 78 | Examples:: 79 | 80 | >>> from transformers import BartConfig, BartModel 81 | 82 | >>> config = BartConfig.from_pretrained('facebook/bart-large') 83 | >>> model = BartModel(config) 84 | """ 85 | if "hidden_size" in common_kwargs: 86 | raise ValueError("hidden size is called d_model") 87 | super().__init__( 88 | num_labels=num_labels, 89 | pad_token_id=pad_token_id, 90 | bos_token_id=bos_token_id, 91 | eos_token_id=eos_token_id, 92 | is_encoder_decoder=is_encoder_decoder, 93 | **common_kwargs, 94 | ) 95 | self.vocab_size = vocab_size 96 | self.d_model = d_model # encoder_embed_dim and decoder_embed_dim 97 | self.encoder_ffn_dim = encoder_ffn_dim 98 | self.encoder_layers = self.num_hidden_layers = encoder_layers 99 | self.encoder_attention_heads = encoder_attention_heads 100 | self.encoder_layerdrop = encoder_layerdrop 101 | self.decoder_layerdrop = decoder_layerdrop 102 | self.decoder_ffn_dim = decoder_ffn_dim 103 | self.decoder_layers = decoder_layers 104 | self.decoder_attention_heads = decoder_attention_heads 105 | self.max_position_embeddings = max_position_embeddings 106 | self.init_std = init_std # Normal(0, this parameter) 107 | self.activation_function = activation_function 108 | 109 | # Params introduced for Mbart 110 | self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True 111 | self.normalize_embedding = normalize_embedding # True for mbart, False otherwise 112 | self.normalize_before = normalize_before # combo of fairseq's encoder_ and decoder_normalize_before 113 | self.add_final_layer_norm = add_final_layer_norm 114 | 115 | # Params introduced for Marian 116 | self.add_bias_logits = add_bias_logits 117 | self.static_position_embeddings = static_position_embeddings 118 | 119 | # 3 Types of Dropout 120 | self.attention_dropout = attention_dropout 121 | self.activation_dropout = activation_dropout 122 | self.dropout = dropout 123 | 124 | # Classifier stuff 125 | self.classif_dropout = classifier_dropout 126 | 127 | # pos embedding offset 128 | self.extra_pos_embeddings = self.pad_token_id + 1 129 | 130 | @property 131 | def num_attention_heads(self) -> int: 132 | return self.encoder_attention_heads 133 | 134 | @property 135 | def hidden_size(self) -> int: 136 | return self.d_model 137 | 138 | def is_valid_mbart(self) -> bool: 139 | """Is the configuration aligned with the MBART paper.""" 140 | if self.normalize_before and self.add_final_layer_norm and self.scale_embedding: 141 | return True 142 | if self.normalize_before or self.add_final_layer_norm or self.scale_embedding: 143 | logger.info("This configuration is a mixture of MBART and BART settings") 144 | return False 145 | 146 | 147 | class MBartConfig(BartConfig): 148 | model_type = "mbart" 149 | -------------------------------------------------------------------------------- /transformers/commands/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | from argparse import ArgumentParser, Namespace 3 | from logging import getLogger 4 | 5 | from transformers import SingleSentenceClassificationProcessor as Processor 6 | from transformers import TextClassificationPipeline, is_tf_available, is_torch_available 7 | from transformers.commands import BaseTransformersCLICommand 8 | 9 | 10 | if not is_tf_available() and not is_torch_available(): 11 | raise RuntimeError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training") 12 | 13 | # TF training parameters 14 | USE_XLA = False 15 | USE_AMP = False 16 | 17 | 18 | 
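A minimal offline sketch of two details of the BartConfig class above (illustrative values, not from the original file): the generic hidden_size / num_attention_heads names are read-only aliases of the fairseq-style parameters, and passing hidden_size directly is rejected in favour of d_model.

from transformers import BartConfig

config = BartConfig(d_model=512, encoder_attention_heads=8, decoder_attention_heads=8)
assert config.hidden_size == config.d_model == 512
assert config.num_attention_heads == config.encoder_attention_heads == 8

try:
    BartConfig(hidden_size=512)
except ValueError as err:
    print(err)  # "hidden size is called d_model"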
def train_command_factory(args: Namespace): 19 | """ 20 | Factory function used to instantiate a training command from provided command line arguments. 21 | :return: TrainCommand 22 | """ 23 | return TrainCommand(args) 24 | 25 | 26 | class TrainCommand(BaseTransformersCLICommand): 27 | @staticmethod 28 | def register_subcommand(parser: ArgumentParser): 29 | """ 30 | Register this command to argparse so it's available for the transformers-cli 31 | :param parser: Root parser to register command-specific arguments 32 | :return: 33 | """ 34 | train_parser = parser.add_parser("train", help="CLI tool to train a model on a task.") 35 | 36 | train_parser.add_argument( 37 | "--train_data", 38 | type=str, 39 | required=True, 40 | help="path to train (and optionally evaluation) dataset as a csv with " 41 | "tab separated labels and sentences.", 42 | ) 43 | train_parser.add_argument( 44 | "--column_label", type=int, default=0, help="Column of the dataset csv file with example labels." 45 | ) 46 | train_parser.add_argument( 47 | "--column_text", type=int, default=1, help="Column of the dataset csv file with example texts." 48 | ) 49 | train_parser.add_argument( 50 | "--column_id", type=int, default=2, help="Column of the dataset csv file with example ids." 51 | ) 52 | train_parser.add_argument( 53 | "--skip_first_row", action="store_true", help="Skip the first row of the csv file (headers)." 54 | ) 55 | 56 | train_parser.add_argument("--validation_data", type=str, default="", help="path to validation dataset.") 57 | train_parser.add_argument( 58 | "--validation_split", 59 | type=float, 60 | default=0.1, 61 | help="if validation dataset is not provided, fraction of train dataset " "to use as validation dataset.", 62 | ) 63 | 64 | train_parser.add_argument("--output", type=str, default="./", help="path to save the trained model.") 65 | 66 | train_parser.add_argument( 67 | "--task", type=str, default="text_classification", help="Task to train the model on." 68 | ) 69 | train_parser.add_argument( 70 | "--model", type=str, default="bert-base-uncased", help="Model's name or path to stored model."
71 | ) 72 | train_parser.add_argument("--train_batch_size", type=int, default=32, help="Batch size for training.") 73 | train_parser.add_argument("--valid_batch_size", type=int, default=64, help="Batch size for validation.") 74 | train_parser.add_argument("--learning_rate", type=float, default=3e-5, help="Learning rate.") 75 | train_parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon for Adam optimizer.") 76 | train_parser.set_defaults(func=train_command_factory) 77 | 78 | def __init__(self, args: Namespace): 79 | self.logger = getLogger("transformers-cli/training") 80 | 81 | self.framework = "tf" if is_tf_available() else "torch" 82 | 83 | os.makedirs(args.output, exist_ok=True) 84 | assert os.path.isdir(args.output) 85 | self.output = args.output 86 | 87 | self.column_label = args.column_label 88 | self.column_text = args.column_text 89 | self.column_id = args.column_id 90 | 91 | self.logger.info("Loading {} pipeline for {}".format(args.task, args.model)) 92 | if args.task == "text_classification": 93 | self.pipeline = TextClassificationPipeline.from_pretrained(args.model) 94 | elif args.task == "token_classification": 95 | raise NotImplementedError 96 | elif args.task == "question_answering": 97 | raise NotImplementedError 98 | 99 | self.logger.info("Loading dataset from {}".format(args.train_data)) 100 | self.train_dataset = Processor.create_from_csv( 101 | args.train_data, 102 | column_label=args.column_label, 103 | column_text=args.column_text, 104 | column_id=args.column_id, 105 | skip_first_row=args.skip_first_row, 106 | ) 107 | self.valid_dataset = None 108 | if args.validation_data: 109 | self.logger.info("Loading validation dataset from {}".format(args.validation_data)) 110 | self.valid_dataset = Processor.create_from_csv( 111 | args.validation_data, 112 | column_label=args.column_label, 113 | column_text=args.column_text, 114 | column_id=args.column_id, 115 | skip_first_row=args.skip_first_row, 116 | ) 117 | 118 | self.validation_split = args.validation_split 119 | self.train_batch_size = args.train_batch_size 120 | self.valid_batch_size = args.valid_batch_size 121 | self.learning_rate = args.learning_rate 122 | self.adam_epsilon = args.adam_epsilon 123 | 124 | def run(self): 125 | if self.framework == "tf": 126 | return self.run_tf() 127 | return self.run_torch() 128 | 129 | def run_torch(self): 130 | raise NotImplementedError 131 | 132 | def run_tf(self): 133 | self.pipeline.fit( 134 | self.train_dataset, 135 | validation_data=self.valid_dataset, 136 | validation_split=self.validation_split, 137 | learning_rate=self.learning_rate, 138 | adam_epsilon=self.adam_epsilon, 139 | train_batch_size=self.train_batch_size, 140 | valid_batch_size=self.valid_batch_size, 141 | ) 142 | 143 | # Save trained pipeline 144 | self.pipeline.save_pretrained(self.output) 145 | -------------------------------------------------------------------------------- /transformers/modeling_tf_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
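A hedged sketch of how the TrainCommand above plugs into an argument parser. The real wiring lives in commands/transformers_cli.py (not shown here), so the parser construction below is illustrative; it also assumes PyTorch or TensorFlow is installed, as required by this module's import-time check.

from argparse import ArgumentParser

from transformers.commands.train import TrainCommand, train_command_factory

parser = ArgumentParser("transformers-cli")          # illustrative entry-point name
subparsers = parser.add_subparsers()
TrainCommand.register_subcommand(subparsers)         # adds the "train" sub-parser defined above

args = parser.parse_args(["train", "--train_data", "train.csv"])
assert args.func is train_command_factory
# args.func(args) would build a TrainCommand, whose __init__ loads the pipeline and the CSV
# dataset and whose run() dispatches to run_tf() or run_torch().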
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ TF 2.0 CamemBERT model. """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_camembert import CamembertConfig 22 | from .file_utils import add_start_docstrings 23 | from .modeling_tf_roberta import ( 24 | TFRobertaForMaskedLM, 25 | TFRobertaForMultipleChoice, 26 | TFRobertaForQuestionAnswering, 27 | TFRobertaForSequenceClassification, 28 | TFRobertaForTokenClassification, 29 | TFRobertaModel, 30 | ) 31 | 32 | 33 | logger = logging.getLogger(__name__) 34 | 35 | TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ 36 | # See all CamemBERT models at https://huggingface.co/models?filter=camembert 37 | ] 38 | 39 | 40 | CAMEMBERT_START_DOCSTRING = r""" 41 | 42 | .. note:: 43 | 44 | TF 2.0 models accepts two formats as inputs: 45 | 46 | - having all inputs as keyword arguments (like PyTorch models), or 47 | - having all inputs as a list, tuple or dict in the first positional arguments. 48 | 49 | This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having 50 | all the tensors in the first argument of the model call function: :obj:`model(inputs)`. 51 | 52 | If you choose this second option, there are three possibilities you can use to gather all the input Tensors 53 | in the first positional argument : 54 | 55 | - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` 56 | - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: 57 | :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` 58 | - a dictionary with one or several input Tensors associated to the input names given in the docstring: 59 | :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` 60 | 61 | Parameters: 62 | config (:class:`~transformers.CamembertConfig`): Model configuration class with all the parameters of the 63 | model. Initializing with a config file does not load the weights associated with the model, only the configuration. 64 | Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. 65 | output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): 66 | If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. 67 | """ 68 | 69 | 70 | @add_start_docstrings( 71 | "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.", 72 | CAMEMBERT_START_DOCSTRING, 73 | ) 74 | class TFCamembertModel(TFRobertaModel): 75 | """ 76 | This class overrides :class:`~transformers.TFRobertaModel`. Please check the 77 | superclass for the appropriate documentation alongside usage examples. 78 | """ 79 | 80 | config_class = CamembertConfig 81 | 82 | 83 | @add_start_docstrings( 84 | """CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING, 85 | ) 86 | class TFCamembertForMaskedLM(TFRobertaForMaskedLM): 87 | """ 88 | This class overrides :class:`~transformers.TFRobertaForMaskedLM`. 
Please check the 89 | superclass for the appropriate documentation alongside usage examples. 90 | """ 91 | 92 | config_class = CamembertConfig 93 | 94 | 95 | @add_start_docstrings( 96 | """CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer 97 | on top of the pooled output) e.g. for GLUE tasks. """, 98 | CAMEMBERT_START_DOCSTRING, 99 | ) 100 | class TFCamembertForSequenceClassification(TFRobertaForSequenceClassification): 101 | """ 102 | This class overrides :class:`~transformers.TFRobertaForSequenceClassification`. Please check the 103 | superclass for the appropriate documentation alongside usage examples. 104 | """ 105 | 106 | config_class = CamembertConfig 107 | 108 | 109 | @add_start_docstrings( 110 | """CamemBERT Model with a token classification head on top (a linear layer on top of 111 | the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, 112 | CAMEMBERT_START_DOCSTRING, 113 | ) 114 | class TFCamembertForTokenClassification(TFRobertaForTokenClassification): 115 | """ 116 | This class overrides :class:`~transformers.TFRobertaForTokenClassification`. Please check the 117 | superclass for the appropriate documentation alongside usage examples. 118 | """ 119 | 120 | config_class = CamembertConfig 121 | 122 | 123 | @add_start_docstrings( 124 | """CamemBERT Model with a multiple choice classification head on top (a linear layer on top of 125 | the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, 126 | CAMEMBERT_START_DOCSTRING, 127 | ) 128 | class TFCamembertForMultipleChoice(TFRobertaForMultipleChoice): 129 | """ 130 | This class overrides :class:`~transformers.TFRobertaForMultipleChoice`. Please check the 131 | superclass for the appropriate documentation alongside usage examples. 132 | """ 133 | 134 | config_class = CamembertConfig 135 | 136 | 137 | @add_start_docstrings( 138 | """CamemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, 139 | CAMEMBERT_START_DOCSTRING, 140 | ) 141 | class TFCamembertForQuestionAnswering(TFRobertaForQuestionAnswering): 142 | """ 143 | This class overrides :class:`~transformers.TFRobertaForQuestionAnswering`. Please check the 144 | superclass for the appropriate documentation alongside usage examples. 145 | """ 146 | 147 | config_class = CamembertConfig 148 | -------------------------------------------------------------------------------- /transformers/modeling_tf_xlm_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ TF 2.0 XLM-RoBERTa model. 
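A minimal sketch of the three input formats described in CAMEMBERT_START_DOCSTRING above, assuming TensorFlow 2 is installed. It builds a deliberately tiny, randomly initialised model from a config instead of downloading pretrained weights, so the small sizes below are illustrative only.

import tensorflow as tf

from transformers import CamembertConfig, TFCamembertModel

config = CamembertConfig(
    vocab_size=100, hidden_size=32, num_hidden_layers=2, num_attention_heads=2, intermediate_size=64
)
model = TFCamembertModel(config)

input_ids = tf.constant([[5, 6, 7, 8]])
attention_mask = tf.ones_like(input_ids)

out_kwargs = model(input_ids, attention_mask=attention_mask)                   # keyword arguments
out_list = model([input_ids, attention_mask])                                  # list, in the documented order
out_dict = model({"input_ids": input_ids, "attention_mask": attention_mask})   # dict keyed by input names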
""" 17 | 18 | 19 | import logging 20 | 21 | from .configuration_xlm_roberta import XLMRobertaConfig 22 | from .file_utils import add_start_docstrings 23 | from .modeling_tf_roberta import ( 24 | TFRobertaForMaskedLM, 25 | TFRobertaForMultipleChoice, 26 | TFRobertaForQuestionAnswering, 27 | TFRobertaForSequenceClassification, 28 | TFRobertaForTokenClassification, 29 | TFRobertaModel, 30 | ) 31 | 32 | 33 | logger = logging.getLogger(__name__) 34 | 35 | TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ 36 | # See all XLM-RoBERTa models at https://huggingface.co/models?filter=xlm-roberta 37 | ] 38 | 39 | 40 | XLM_ROBERTA_START_DOCSTRING = r""" 41 | 42 | .. note:: 43 | 44 | TF 2.0 models accepts two formats as inputs: 45 | 46 | - having all inputs as keyword arguments (like PyTorch models), or 47 | - having all inputs as a list, tuple or dict in the first positional arguments. 48 | 49 | This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having 50 | all the tensors in the first argument of the model call function: :obj:`model(inputs)`. 51 | 52 | If you choose this second option, there are three possibilities you can use to gather all the input Tensors 53 | in the first positional argument : 54 | 55 | - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` 56 | - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: 57 | :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` 58 | - a dictionary with one or several input Tensors associated to the input names given in the docstring: 59 | :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` 60 | 61 | Parameters: 62 | config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the 63 | model. Initializing with a config file does not load the weights associated with the model, only the configuration. 64 | Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. 65 | output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): 66 | If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. 67 | """ 68 | 69 | 70 | @add_start_docstrings( 71 | "The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", 72 | XLM_ROBERTA_START_DOCSTRING, 73 | ) 74 | class TFXLMRobertaModel(TFRobertaModel): 75 | """ 76 | This class overrides :class:`~transformers.TFRobertaModel`. Please check the 77 | superclass for the appropriate documentation alongside usage examples. 78 | """ 79 | 80 | config_class = XLMRobertaConfig 81 | 82 | 83 | @add_start_docstrings( 84 | """XLM-RoBERTa Model with a `language modeling` head on top. """, XLM_ROBERTA_START_DOCSTRING, 85 | ) 86 | class TFXLMRobertaForMaskedLM(TFRobertaForMaskedLM): 87 | """ 88 | This class overrides :class:`~transformers.TFRobertaForMaskedLM`. Please check the 89 | superclass for the appropriate documentation alongside usage examples. 90 | """ 91 | 92 | config_class = XLMRobertaConfig 93 | 94 | 95 | @add_start_docstrings( 96 | """XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer 97 | on top of the pooled output) e.g. for GLUE tasks. 
""", 98 | XLM_ROBERTA_START_DOCSTRING, 99 | ) 100 | class TFXLMRobertaForSequenceClassification(TFRobertaForSequenceClassification): 101 | """ 102 | This class overrides :class:`~transformers.TFRobertaForSequenceClassification`. Please check the 103 | superclass for the appropriate documentation alongside usage examples. 104 | """ 105 | 106 | config_class = XLMRobertaConfig 107 | 108 | 109 | @add_start_docstrings( 110 | """XLM-RoBERTa Model with a token classification head on top (a linear layer on top of 111 | the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, 112 | XLM_ROBERTA_START_DOCSTRING, 113 | ) 114 | class TFXLMRobertaForTokenClassification(TFRobertaForTokenClassification): 115 | """ 116 | This class overrides :class:`~transformers.TFRobertaForTokenClassification`. Please check the 117 | superclass for the appropriate documentation alongside usage examples. 118 | """ 119 | 120 | config_class = XLMRobertaConfig 121 | 122 | 123 | @add_start_docstrings( 124 | """XLM-RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, 125 | XLM_ROBERTA_START_DOCSTRING, 126 | ) 127 | class TFXLMRobertaForQuestionAnswering(TFRobertaForQuestionAnswering): 128 | """ 129 | This class overrides :class:`~transformers.TFRobertaForQuestionAnsweringSimple`. Please check the 130 | superclass for the appropriate documentation alongside usage examples. 131 | """ 132 | 133 | config_class = XLMRobertaConfig 134 | 135 | 136 | @add_start_docstrings( 137 | """Roberta Model with a multiple choice classification head on top (a linear layer on top of 138 | the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, 139 | XLM_ROBERTA_START_DOCSTRING, 140 | ) 141 | class TFXLMRobertaForMultipleChoice(TFRobertaForMultipleChoice): 142 | """ 143 | This class overrides :class:`~transformers.TFRobertaForMultipleChoice`. Please check the 144 | superclass for the appropriate documentation alongside usage examples. 145 | """ 146 | 147 | config_class = XLMRobertaConfig 148 | -------------------------------------------------------------------------------- /transformers/convert_bart_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert BART checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | import os 21 | from pathlib import Path 22 | 23 | import fairseq 24 | import torch 25 | from packaging import version 26 | 27 | from transformers import ( 28 | BartConfig, 29 | BartForConditionalGeneration, 30 | BartForSequenceClassification, 31 | BartModel, 32 | BartTokenizer, 33 | ) 34 | from transformers.modeling_bart import _make_linear_from_emb 35 | 36 | 37 | FAIRSEQ_MODELS = ["bart.large", "bart.large.mnli", "bart.large.cnn", "bart_xsum/model.pt"] 38 | extra_arch = {"bart.large": BartModel, "bart.large.mnli": BartForSequenceClassification} 39 | if version.parse(fairseq.__version__) < version.parse("0.9.0"): 40 | raise Exception("requires fairseq >= 0.9.0") 41 | 42 | 43 | logging.basicConfig(level=logging.INFO) 44 | logger = logging.getLogger(__name__) 45 | 46 | SAMPLE_TEXT = " Hello world! cécé herlolip" 47 | 48 | mnli_rename_keys = [ 49 | ("model.classification_heads.mnli.dense.weight", "classification_head.dense.weight"), 50 | ("model.classification_heads.mnli.dense.bias", "classification_head.dense.bias"), 51 | ("model.classification_heads.mnli.out_proj.weight", "classification_head.out_proj.weight"), 52 | ("model.classification_heads.mnli.out_proj.bias", "classification_head.out_proj.bias"), 53 | ] 54 | 55 | 56 | def remove_ignore_keys_(state_dict): 57 | ignore_keys = [ 58 | "encoder.version", 59 | "decoder.version", 60 | "model.encoder.version", 61 | "model.decoder.version", 62 | "_float_tensor", 63 | ] 64 | for k in ignore_keys: 65 | state_dict.pop(k, None) 66 | 67 | 68 | def rename_key(dct, old, new): 69 | val = dct.pop(old) 70 | dct[new] = val 71 | 72 | 73 | def load_xsum_checkpoint(checkpoint_path): 74 | """Checkpoint path should end in model.pt""" 75 | sd = torch.load(checkpoint_path, map_location="cpu") 76 | hub_interface = torch.hub.load("pytorch/fairseq", "bart.large.cnn").eval() 77 | hub_interface.model.load_state_dict(sd["model"]) 78 | return hub_interface 79 | 80 | 81 | def convert_checkpoint_from_disk(checkpoint_path, **config_kwargs): 82 | state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] 83 | remove_ignore_keys_(state_dict) 84 | vocab_size = state_dict["encoder.embed_tokens.weight"].shape[0] 85 | state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] 86 | mbart_config = BartConfig(vocab_size=vocab_size, **config_kwargs) 87 | model = BartForConditionalGeneration(mbart_config) 88 | model.model.load_state_dict(state_dict) 89 | if hasattr(model, "lm_head"): 90 | model.lm_head = _make_linear_from_emb(model.model.shared) 91 | return model 92 | 93 | 94 | @torch.no_grad() 95 | def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path, hf_checkpoint_name=None): 96 | """ 97 | Copy/paste/tweak model's weights to our BERT structure. 
98 | """ 99 | if not os.path.exists(checkpoint_path): 100 | bart = torch.hub.load("pytorch/fairseq", checkpoint_path).eval() 101 | else: 102 | bart = load_xsum_checkpoint(checkpoint_path) 103 | 104 | bart.model.upgrade_state_dict(bart.model.state_dict()) 105 | if hf_checkpoint_name is None: 106 | hf_checkpoint_name = checkpoint_path.replace(".", "-") 107 | config = BartConfig.from_pretrained(hf_checkpoint_name) 108 | tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0) 109 | tokens2 = BartTokenizer.from_pretrained(hf_checkpoint_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0) 110 | assert torch.eq(tokens, tokens2).all() 111 | 112 | if checkpoint_path == "bart.large.mnli": 113 | state_dict = bart.state_dict() 114 | remove_ignore_keys_(state_dict) 115 | state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"] 116 | for src, dest in mnli_rename_keys: 117 | rename_key(state_dict, src, dest) 118 | model = BartForSequenceClassification(config).eval() 119 | model.load_state_dict(state_dict) 120 | fairseq_output = bart.predict("mnli", tokens, return_logits=True) 121 | new_model_outputs = model(tokens)[0] # logits 122 | else: # no classification heads to worry about 123 | state_dict = bart.model.state_dict() 124 | remove_ignore_keys_(state_dict) 125 | state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] 126 | fairseq_output = bart.extract_features(tokens) 127 | if hf_checkpoint_name == "facebook/bart-large": 128 | model = BartModel(config).eval() 129 | model.load_state_dict(state_dict) 130 | new_model_outputs = model(tokens).model[0] 131 | else: 132 | model = BartForConditionalGeneration(config).eval() # an existing summarization ckpt 133 | model.model.load_state_dict(state_dict) 134 | if hasattr(model, "lm_head"): 135 | model.lm_head = _make_linear_from_emb(model.model.shared) 136 | new_model_outputs = model.model(tokens)[0] 137 | 138 | # Check results 139 | assert fairseq_output.shape == new_model_outputs.shape 140 | assert (fairseq_output == new_model_outputs).all().item() 141 | Path(pytorch_dump_folder_path).mkdir(exist_ok=True) 142 | model.save_pretrained(pytorch_dump_folder_path) 143 | 144 | 145 | if __name__ == "__main__": 146 | parser = argparse.ArgumentParser() 147 | # Required parameters 148 | parser.add_argument( 149 | "fairseq_path", type=str, help="bart.large, bart.large.cnn or a path to a model.pt on local filesystem." 150 | ) 151 | parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") 152 | parser.add_argument( 153 | "--hf_config", default=None, type=str, help="Which huggingface architecture to use: bart-large-xsum" 154 | ) 155 | args = parser.parse_args() 156 | convert_bart_checkpoint(args.fairseq_path, args.pytorch_dump_folder_path, hf_checkpoint_name=args.hf_config) 157 | -------------------------------------------------------------------------------- /transformers/configuration_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ DistilBERT model configuration """ 16 | 17 | 18 | import logging 19 | 20 | from .configuration_utils import PretrainedConfig 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 26 | "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", 27 | "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json", 28 | "distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-config.json", 29 | "distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-distilled-squad-config.json", 30 | "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json", 31 | "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json", 32 | "distilbert-base-uncased-finetuned-sst-2-english": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json", 33 | } 34 | 35 | 36 | class DistilBertConfig(PretrainedConfig): 37 | r""" 38 | This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`. 39 | It is used to instantiate a DistilBERT model according to the specified arguments, defining the model 40 | architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of 41 | the DistilBERT `distilbert-base-uncased `__ architecture. 42 | 43 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used 44 | to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` 45 | for more information. 46 | 47 | 48 | Args: 49 | vocab_size (:obj:`int`, optional, defaults to 30522): 50 | Vocabulary size of the DistilBERT model. Defines the different tokens that 51 | can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`. 52 | max_position_embeddings (:obj:`int`, optional, defaults to 512): 53 | The maximum sequence length that this model might ever be used with. 54 | Typically set this to something large just in case (e.g., 512 or 1024 or 2048). 55 | sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`): 56 | Whether to use sinusoidal positional embeddings. 57 | n_layers (:obj:`int`, optional, defaults to 6): 58 | Number of hidden layers in the Transformer encoder. 59 | n_heads (:obj:`int`, optional, defaults to 12): 60 | Number of attention heads for each attention layer in the Transformer encoder. 61 | dim (:obj:`int`, optional, defaults to 768): 62 | Dimensionality of the encoder layers and the pooler layer. 63 | hidden_dim (:obj:`int`, optional, defaults to 3072): 64 | The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 
65 | dropout (:obj:`float`, optional, defaults to 0.1): 66 | The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. 67 | attention_dropout (:obj:`float`, optional, defaults to 0.1): 68 | The dropout ratio for the attention probabilities. 69 | activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): 70 | The non-linear activation function (function or string) in the encoder and pooler. 71 | If string, "gelu", "relu", "swish" and "gelu_new" are supported. 72 | initializer_range (:obj:`float`, optional, defaults to 0.02): 73 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 74 | qa_dropout (:obj:`float`, optional, defaults to 0.1): 75 | The dropout probabilities used in the question answering model 76 | :class:`~transformers.DistilBertForQuestionAnswering`. 77 | seq_classif_dropout (:obj:`float`, optional, defaults to 0.2): 78 | The dropout probabilities used in the sequence classification and the multiple choice model 79 | :class:`~transformers.DistilBertForSequenceClassification`. 80 | 81 | Example:: 82 | 83 | >>> from transformers import DistilBertModel, DistilBertConfig 84 | 85 | >>> # Initializing a DistilBERT configuration 86 | >>> configuration = DistilBertConfig() 87 | 88 | >>> # Initializing a model from the configuration 89 | >>> model = DistilBertModel(configuration) 90 | 91 | >>> # Accessing the model configuration 92 | >>> configuration = model.config 93 | """ 94 | model_type = "distilbert" 95 | 96 | def __init__( 97 | self, 98 | vocab_size=30522, 99 | max_position_embeddings=512, 100 | sinusoidal_pos_embds=False, 101 | n_layers=6, 102 | n_heads=12, 103 | dim=768, 104 | hidden_dim=4 * 768, 105 | dropout=0.1, 106 | attention_dropout=0.1, 107 | activation="gelu", 108 | initializer_range=0.02, 109 | qa_dropout=0.1, 110 | seq_classif_dropout=0.2, 111 | pad_token_id=0, 112 | **kwargs 113 | ): 114 | super().__init__(**kwargs, pad_token_id=pad_token_id) 115 | self.vocab_size = vocab_size 116 | self.max_position_embeddings = max_position_embeddings 117 | self.sinusoidal_pos_embds = sinusoidal_pos_embds 118 | self.n_layers = n_layers 119 | self.n_heads = n_heads 120 | self.dim = dim 121 | self.hidden_dim = hidden_dim 122 | self.dropout = dropout 123 | self.attention_dropout = attention_dropout 124 | self.activation = activation 125 | self.initializer_range = initializer_range 126 | self.qa_dropout = qa_dropout 127 | self.seq_classif_dropout = seq_classif_dropout 128 | 129 | @property 130 | def hidden_size(self): 131 | return self.dim 132 | 133 | @property 134 | def num_attention_heads(self): 135 | return self.n_heads 136 | 137 | @property 138 | def num_hidden_layers(self): 139 | return self.n_layers 140 | --------------------------------------------------------------------------------
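A minimal offline sketch of the property aliases at the bottom of DistilBertConfig above: DistilBERT keeps its own parameter names (dim, n_heads, n_layers) while exposing the generic names used elsewhere in the library.

from transformers import DistilBertConfig

config = DistilBertConfig()  # defaults as documented above
assert config.hidden_size == config.dim == 768
assert config.num_attention_heads == config.n_heads == 12
assert config.num_hidden_layers == config.n_layers == 6
assert config.hidden_dim == 4 * 768  # feed-forward size keeps its DistilBERT-specific name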