├── tests ├── __init__.py ├── fixtures │ ├── empty.txt │ ├── input.txt │ ├── spiece.model │ ├── test_sentencepiece.model │ └── sample_text.txt ├── test_tokenization_utils.py ├── test_tokenization_distilbert.py ├── test_modeling_encoder_decoder.py ├── test_tokenization_auto.py ├── utils.py ├── test_configuration_common.py ├── test_tokenization_ctrl.py ├── test_tokenization_openai.py ├── test_tokenization_transfo_xl.py ├── test_tokenization_gpt2.py ├── test_tokenization_albert.py ├── test_model_card.py ├── test_tokenization_xlm.py ├── test_hf_api.py ├── test_optimization_tf.py ├── test_tokenization_t5.py ├── test_modeling_tf_auto.py ├── test_modeling_auto.py └── test_tokenization_roberta.py ├── MANIFEST.in ├── docs ├── source │ ├── examples.md │ ├── _static │ │ └── css │ │ │ ├── Calibre-Thin.otf │ │ │ ├── Calibre-Light.ttf │ │ │ ├── Calibre-Medium.otf │ │ │ ├── Calibre-Regular.otf │ │ │ └── code-snippets.css │ ├── imgs │ │ ├── transformers_logo_name.png │ │ ├── warmup_constant_schedule.png │ │ ├── warmup_cosine_schedule.png │ │ ├── warmup_linear_schedule.png │ │ ├── warmup_cosine_hard_restarts_schedule.png │ │ └── warmup_cosine_warm_restarts_schedule.png │ ├── main_classes │ │ ├── configuration.rst │ │ ├── model.rst │ │ ├── tokenizer.rst │ │ └── optimizer_schedules.rst │ ├── model_doc │ │ ├── transformerxl.rst │ │ ├── auto.rst │ │ ├── camembert.rst │ │ ├── gpt2.rst │ │ ├── ctrl.rst │ │ ├── roberta.rst │ │ ├── gpt.rst │ │ ├── albert.rst │ │ ├── xlm.rst │ │ ├── distilbert.rst │ │ ├── xlnet.rst │ │ └── bert.rst │ ├── model_sharing.md │ ├── bertology.rst │ ├── notebooks.rst │ ├── installation.md │ ├── benchmarks.md │ ├── multilingual.rst │ └── converting_tensorflow_models.rst ├── Makefile └── README.md ├── .DS_Store ├── examples ├── .DS_Store ├── generate_postag_and_labels.py └── configuration │ └── configuration_bert.py ├── Makefile ├── docker └── Dockerfile ├── src └── transformers │ ├── commands │ ├── __init__.py │ ├── download.py │ └── run.py │ ├── data │ ├── processors │ │ ├── __init__.py │ │ └── xnli.py │ ├── __init__.py │ └── metrics │ │ └── __init__.py │ ├── configuration_camembert.py │ ├── __main__.py │ ├── configuration_mmbt.py │ ├── configuration_roberta.py │ ├── configuration_xlm_roberta.py │ ├── convert_t5_original_tf_checkpoint_to_pytorch.py │ ├── convert_bert_original_tf_checkpoint_to_pytorch.py │ ├── convert_albert_original_tf_checkpoint_to_pytorch.py │ ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py │ ├── tokenization_distilbert.py │ ├── convert_openai_original_tf_checkpoint_to_pytorch.py │ ├── configuration_distilbert.py │ ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_kcnet_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py │ ├── configuration_t5.py │ ├── configuration_openai.py │ ├── convert_bert_pytorch_checkpoint_to_original_tf.py │ └── configuration_ctrl.py ├── templates ├── adding_a_new_example_script │ └── README.md └── adding_a_new_model │ ├── tests │ └── test_tokenization_xxx.py │ ├── convert_xxx_original_tf_checkpoint_to_pytorch.py │ ├── README.md │ └── configuration_xxx.py ├── setup.cfg ├── deploy_multi_version_doc.sh ├── transformers-cli ├── utils └── link_tester.py ├── valohai.yaml └── setup.py /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/fixtures/empty.txt: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | -------------------------------------------------------------------------------- /docs/source/examples.md: -------------------------------------------------------------------------------- 1 | ../../examples/README.md -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi220/ggdp/HEAD/.DS_Store -------------------------------------------------------------------------------- /tests/fixtures/input.txt: -------------------------------------------------------------------------------- 1 | Who was Jim Henson ? ||| Jim Henson was a puppeteer 2 | -------------------------------------------------------------------------------- /examples/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi220/ggdp/HEAD/examples/.DS_Store -------------------------------------------------------------------------------- /tests/fixtures/spiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi220/ggdp/HEAD/tests/fixtures/spiece.model -------------------------------------------------------------------------------- /docs/source/_static/css/Calibre-Thin.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi220/ggdp/HEAD/docs/source/_static/css/Calibre-Thin.otf -------------------------------------------------------------------------------- /tests/fixtures/test_sentencepiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi220/ggdp/HEAD/tests/fixtures/test_sentencepiece.model -------------------------------------------------------------------------------- /docs/source/_static/css/Calibre-Light.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi220/ggdp/HEAD/docs/source/_static/css/Calibre-Light.ttf -------------------------------------------------------------------------------- /docs/source/_static/css/Calibre-Medium.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi220/ggdp/HEAD/docs/source/_static/css/Calibre-Medium.otf -------------------------------------------------------------------------------- /docs/source/_static/css/Calibre-Regular.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi220/ggdp/HEAD/docs/source/_static/css/Calibre-Regular.otf -------------------------------------------------------------------------------- /docs/source/imgs/transformers_logo_name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi220/ggdp/HEAD/docs/source/imgs/transformers_logo_name.png -------------------------------------------------------------------------------- /docs/source/imgs/warmup_constant_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi220/ggdp/HEAD/docs/source/imgs/warmup_constant_schedule.png 
-------------------------------------------------------------------------------- /docs/source/imgs/warmup_cosine_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi220/ggdp/HEAD/docs/source/imgs/warmup_cosine_schedule.png -------------------------------------------------------------------------------- /docs/source/imgs/warmup_linear_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi220/ggdp/HEAD/docs/source/imgs/warmup_linear_schedule.png -------------------------------------------------------------------------------- /docs/source/imgs/warmup_cosine_hard_restarts_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi220/ggdp/HEAD/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png -------------------------------------------------------------------------------- /docs/source/imgs/warmup_cosine_warm_restarts_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi220/ggdp/HEAD/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: style 2 | 3 | style: 4 | black --line-length 119 examples templates tests src utils 5 | isort --recursive examples templates tests src utils 6 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:latest 2 | 3 | RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext 4 | 5 | RUN pip install transformers 6 | 7 | WORKDIR /workspace -------------------------------------------------------------------------------- /docs/source/_static/css/code-snippets.css: -------------------------------------------------------------------------------- 1 | 2 | .highlight .c1, .highlight .sd{ 3 | color: #999 4 | } 5 | 6 | .highlight .nn, .highlight .k, .highlight .s1, .highlight .nb, .highlight .bp, .highlight .kc { 7 | color: #FB8D68; 8 | } 9 | 10 | .highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow { 11 | color: #6670FF; 12 | } -------------------------------------------------------------------------------- /src/transformers/commands/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from argparse import ArgumentParser 3 | 4 | 5 | class BaseTransformersCLICommand(ABC): 6 | @staticmethod 7 | @abstractmethod 8 | def register_subcommand(parser: ArgumentParser): 9 | raise NotImplementedError() 10 | 11 | @abstractmethod 12 | def run(self): 13 | raise NotImplementedError() 14 | -------------------------------------------------------------------------------- /docs/source/main_classes/configuration.rst: -------------------------------------------------------------------------------- 1 | Configuration 2 | ---------------------------------------------------- 3 | 4 | The base class ``PretrainedConfig`` implements the common methods for loading/saving a configuration either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from 
HuggingFace's AWS S3 repository). 5 | 6 | ``PretrainedConfig`` 7 | ~~~~~~~~~~~~~~~~~~~~~ 8 | 9 | .. autoclass:: transformers.PretrainedConfig 10 | :members: 11 | -------------------------------------------------------------------------------- /templates/adding_a_new_example_script/README.md: -------------------------------------------------------------------------------- 1 | # How to add a new example script in 🤗Transformers 2 | 3 | This folder provides a template for adding a new example script implementing a training or inference task with the models in the 🤗Transformers library. 4 | 5 | Currently, only examples for PyTorch are provided; they are adaptations of the library's SQuAD examples, which implement single-GPU and distributed training with gradient accumulation and mixed-precision (using NVIDIA's apex library) to cover a reasonable range of use cases. 6 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | ensure_newline_before_comments = True 3 | force_grid_wrap = 0 4 | include_trailing_comma = True 5 | known_first_party = transformers 6 | known_third_party = 7 | fairseq 8 | fastprogress 9 | git 10 | MeCab 11 | nltk 12 | packaging 13 | PIL 14 | psutil 15 | seqeval 16 | sklearn 17 | tensorboardX 18 | tensorflow_datasets 19 | torchtext 20 | torchvision 21 | 22 | line_length = 119 23 | lines_after_imports = 2 24 | multi_line_output = 3 25 | use_parentheses = True 26 | 27 | [flake8] 28 | ignore = E203, E501, F841, W503 29 | max-line-length = 119 30 | -------------------------------------------------------------------------------- /src/transformers/data/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels 6 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features 7 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor 8 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels 9 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /deploy_multi_version_doc.sh: -------------------------------------------------------------------------------- 1 | cd docs 2 | 3 | function deploy_doc(){ 4 | echo "Creating doc at commit $1 and pushing to folder $2" 5 | git checkout $1 6 | if [ ! -z "$2" ] 7 | then 8 | echo "Pushing version" $2 9 | make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2 10 | else 11 | echo "Pushing master" 12 | make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir 13 | fi 14 | } 15 | 16 | deploy_doc "master" 17 | deploy_doc "b33a385" v1.0.0 18 | deploy_doc "fe02e45" v1.1.0 19 | deploy_doc "89fd345" v1.2.0 20 | deploy_doc "fc9faa8" v2.0.0 21 | deploy_doc "3ddce1d" v2.1.1 22 | deploy_doc "f2f3294" v2.2.0 23 | deploy_doc "d0f8b9a" v2.3.0 24 | -------------------------------------------------------------------------------- /docs/source/main_classes/model.rst: -------------------------------------------------------------------------------- 1 | Models 2 | ---------------------------------------------------- 3 | 4 | The base class ``PreTrainedModel`` implements the common methods for loading/saving a model either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository). 5 | 6 | ``PreTrainedModel`` also implements a few methods which are common among all the models to: 7 | 8 | - resize the input token embeddings when new tokens are added to the vocabulary 9 | - prune the attention heads of the model. 10 | 11 | ``PreTrainedModel`` 12 | ~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | .. autoclass:: transformers.PreTrainedModel 15 | :members: 16 | 17 | ``TFPreTrainedModel`` 18 | ~~~~~~~~~~~~~~~~~~~~~ 19 | 20 | .. autoclass:: transformers.TFPreTrainedModel 21 | :members: 22 | -------------------------------------------------------------------------------- /src/transformers/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | from .metrics import is_sklearn_available 6 | from .processors import ( 7 | DataProcessor, 8 | InputExample, 9 | InputFeatures, 10 | SingleSentenceClassificationProcessor, 11 | SquadExample, 12 | SquadFeatures, 13 | SquadV1Processor, 14 | SquadV2Processor, 15 | glue_convert_examples_to_features, 16 | glue_output_modes, 17 | glue_processors, 18 | glue_tasks_num_labels, 19 | squad_convert_examples_to_features, 20 | xnli_output_modes, 21 | xnli_processors, 22 | xnli_tasks_num_labels, 23 | ) 24 | 25 | 26 | if is_sklearn_available(): 27 | from .metrics import glue_compute_metrics, xnli_compute_metrics 28 | -------------------------------------------------------------------------------- /docs/source/main_classes/tokenizer.rst: -------------------------------------------------------------------------------- 1 | Tokenizer 2 | ---------------------------------------------------- 3 | 4 | The base class ``PreTrainedTokenizer`` implements the common methods for loading/saving a tokenizer either from a local file or directory, or from a pretrained tokenizer provided by the library (downloaded from HuggingFace's AWS S3 repository). 
5 | 6 | ``PreTrainedTokenizer`` is the main entry point into tokenizers as it also implements the main methods for using all the tokenizers: 7 | 8 | - tokenizing, converting tokens to ids and back, and encoding/decoding, 9 | - adding new tokens to the vocabulary in a way that is independent of the underlying structure (BPE, SentencePiece...), 10 | - managing special tokens (adding them, assigning them to roles, making sure they are not split during tokenization) 11 | 12 | ``PreTrainedTokenizer`` 13 | ~~~~~~~~~~~~~~~~~~~~~~~~ 14 | 15 | .. autoclass:: transformers.PreTrainedTokenizer 16 | :members: 17 | -------------------------------------------------------------------------------- /docs/source/model_doc/transformerxl.rst: -------------------------------------------------------------------------------- 1 | Transformer XL 2 | ---------------------------------------------------- 3 | 4 | 5 | ``TransfoXLConfig`` 6 | ~~~~~~~~~~~~~~~~~~~~~ 7 | 8 | .. autoclass:: transformers.TransfoXLConfig 9 | :members: 10 | 11 | 12 | ``TransfoXLTokenizer`` 13 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 14 | 15 | .. autoclass:: transformers.TransfoXLTokenizer 16 | :members: 17 | 18 | 19 | ``TransfoXLModel`` 20 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 21 | 22 | .. autoclass:: transformers.TransfoXLModel 23 | :members: 24 | 25 | 26 | ``TransfoXLLMHeadModel`` 27 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 28 | 29 | .. autoclass:: transformers.TransfoXLLMHeadModel 30 | :members: 31 | 32 | 33 | ``TFTransfoXLModel`` 34 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 35 | 36 | .. autoclass:: transformers.TFTransfoXLModel 37 | :members: 38 | 39 | 40 | ``TFTransfoXLLMHeadModel`` 41 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 42 | 43 | .. autoclass:: transformers.TFTransfoXLLMHeadModel 44 | :members: 45 | -------------------------------------------------------------------------------- /docs/source/model_doc/auto.rst: -------------------------------------------------------------------------------- 1 | AutoModels 2 | ----------- 3 | 4 | In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you are supplying to the ``from_pretrained`` method. 5 | 6 | AutoClasses are here to do this job for you so that you automatically retrieve the relevant model given the name/path to the pretrained weights/config/vocabulary: 7 | 8 | Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will directly create an instance of the relevant architecture class (ex: ``model = AutoModel.from_pretrained('bert-base-cased')`` will create an instance of ``BertModel``). 9 | 10 | 11 | ``AutoConfig`` 12 | ~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | .. autoclass:: transformers.AutoConfig 15 | :members: 16 | 17 | 18 | ``AutoModel`` 19 | ~~~~~~~~~~~~~~~~~~~~~ 20 | 21 | .. autoclass:: transformers.AutoModel 22 | :members: 23 | 24 | 25 | ``AutoTokenizer`` 26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 27 | 28 | .. autoclass:: transformers.AutoTokenizer 29 | :members: 30 |
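A minimal sketch of the behaviour described above, assuming the `transformers` package is installed and the pretrained weights can be downloaded:

```python
from transformers import AutoConfig, AutoModel, AutoTokenizer

# "bert-base-cased" resolves to the BERT architecture, so these calls
# return BertConfig / BertTokenizer / BertModel instances.
config = AutoConfig.from_pretrained("bert-base-cased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModel.from_pretrained("bert-base-cased")

print(type(model).__name__)  # BertModel
```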
-------------------------------------------------------------------------------- /transformers-cli: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from argparse import ArgumentParser 3 | 4 | from transformers.commands.download import DownloadCommand 5 | from transformers.commands.run import RunCommand 6 | from transformers.commands.user import UserCommands 7 | from transformers.commands.convert import ConvertCommand 8 | from transformers.commands.serving import ServeCommand 9 | 10 | if __name__ == '__main__': 11 | parser = ArgumentParser('Transformers CLI tool', usage='transformers-cli <command> [<args>]') 12 | commands_parser = parser.add_subparsers(help='transformers-cli command helpers') 13 | 14 | # Register commands 15 | ConvertCommand.register_subcommand(commands_parser) 16 | DownloadCommand.register_subcommand(commands_parser) 17 | RunCommand.register_subcommand(commands_parser) 18 | ServeCommand.register_subcommand(commands_parser) 19 | UserCommands.register_subcommand(commands_parser) 20 | 21 | # Let's go 22 | args = parser.parse_args() 23 | 24 | if not hasattr(args, 'func'): 25 | parser.print_help() 26 | exit(1) 27 | 28 | # Run 29 | service = args.func(args) 30 | service.run() 31 | -------------------------------------------------------------------------------- /docs/source/model_doc/camembert.rst: -------------------------------------------------------------------------------- 1 | CamemBERT 2 | ---------------------------------------------------- 3 | 4 | ``CamembertConfig`` 5 | ~~~~~~~~~~~~~~~~~~~~~ 6 | 7 | .. autoclass:: transformers.CamembertConfig 8 | :members: 9 | 10 | 11 | ``CamembertTokenizer`` 12 | ~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | .. autoclass:: transformers.CamembertTokenizer 15 | :members: 16 | 17 | 18 | ``CamembertModel`` 19 | ~~~~~~~~~~~~~~~~~~~~ 20 | 21 | .. autoclass:: transformers.CamembertModel 22 | :members: 23 | 24 | 25 | ``CamembertForMaskedLM`` 26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 27 | 28 | .. autoclass:: transformers.CamembertForMaskedLM 29 | :members: 30 | 31 | 32 | ``CamembertForSequenceClassification`` 33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 34 | 35 | .. autoclass:: transformers.CamembertForSequenceClassification 36 | :members: 37 | 38 | 39 | ``CamembertForMultipleChoice`` 40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 41 | 42 | .. autoclass:: transformers.CamembertForMultipleChoice 43 | :members: 44 | 45 | 46 | ``CamembertForTokenClassification`` 47 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 48 | 49 | .. autoclass:: transformers.CamembertForTokenClassification 50 | :members: 51 | -------------------------------------------------------------------------------- /docs/source/model_doc/gpt2.rst: -------------------------------------------------------------------------------- 1 | OpenAI GPT2 2 | ---------------------------------------------------- 3 | 4 | ``GPT2Config`` 5 | ~~~~~~~~~~~~~~~~~~~~~ 6 | 7 | .. autoclass:: transformers.GPT2Config 8 | :members: 9 | 10 | 11 | ``GPT2Tokenizer`` 12 | ~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | .. autoclass:: transformers.GPT2Tokenizer 15 | :members: 16 | 17 | 18 | ``GPT2Model`` 19 | ~~~~~~~~~~~~~~~~~~~~~ 20 | 21 | .. autoclass:: transformers.GPT2Model 22 | :members: 23 | 24 | 25 | ``GPT2LMHeadModel`` 26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 27 | 28 | .. autoclass:: transformers.GPT2LMHeadModel 29 | :members: 30 | 31 | 32 | ``GPT2DoubleHeadsModel`` 33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 34 | 35 | .. autoclass:: transformers.GPT2DoubleHeadsModel
36 | :members: 37 | 38 | 39 | ``TFGPT2Model`` 40 | ~~~~~~~~~~~~~~~~~~~~~ 41 | 42 | .. autoclass:: transformers.TFGPT2Model 43 | :members: 44 | 45 | 46 | ``TFGPT2LMHeadModel`` 47 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 48 | 49 | .. autoclass:: transformers.TFGPT2LMHeadModel 50 | :members: 51 | 52 | 53 | ``TFGPT2DoubleHeadsModel`` 54 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 55 | 56 | .. autoclass:: transformers.TFGPT2DoubleHeadsModel 57 | :members: 58 | -------------------------------------------------------------------------------- /docs/source/model_doc/ctrl.rst: -------------------------------------------------------------------------------- 1 | CTRL 2 | ---------------------------------------------------- 3 | 4 | Note: if you fine-tune a CTRL model using the Salesforce code (https://github.com/salesforce/ctrl), 5 | you'll be able to convert from TF to our HuggingFace/Transformers format using the 6 | ``convert_tf_to_huggingface_pytorch.py`` script (see `issue #1654 <https://github.com/huggingface/transformers/issues/1654>`_). 7 | 8 | 9 | ``CTRLConfig`` 10 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 11 | 12 | .. autoclass:: transformers.CTRLConfig 13 | :members: 14 | 15 | 16 | ``CTRLTokenizer`` 17 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | 19 | .. autoclass:: transformers.CTRLTokenizer 20 | :members: 21 | 22 | 23 | ``CTRLModel`` 24 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 25 | 26 | .. autoclass:: transformers.CTRLModel 27 | :members: 28 | 29 | 30 | ``CTRLLMHeadModel`` 31 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 32 | 33 | .. autoclass:: transformers.CTRLLMHeadModel 34 | :members: 35 | 36 | 37 | ``TFCTRLModel`` 38 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 39 | 40 | .. autoclass:: transformers.TFCTRLModel 41 | :members: 42 | 43 | 44 | ``TFCTRLLMHeadModel`` 45 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 46 | 47 | .. autoclass:: transformers.TFCTRLLMHeadModel 48 | :members: 49 | 50 | -------------------------------------------------------------------------------- /src/transformers/configuration_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | """ CamemBERT configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_roberta import RobertaConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json", 28 | } 29 | 30 | 31 | class CamembertConfig(RobertaConfig): 32 | pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 33 | -------------------------------------------------------------------------------- /docs/source/model_doc/roberta.rst: -------------------------------------------------------------------------------- 1 | RoBERTa 2 | ---------------------------------------------------- 3 | 4 | ``RobertaConfig`` 5 | ~~~~~~~~~~~~~~~~~~~~~ 6 | 7 | .. autoclass:: transformers.RobertaConfig 8 | :members: 9 | 10 | 11 | ``RobertaTokenizer`` 12 | ~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | .. autoclass:: transformers.RobertaTokenizer 15 | :members: 16 | 17 | 18 | ``RobertaModel`` 19 | ~~~~~~~~~~~~~~~~~~~~ 20 | 21 | .. autoclass:: transformers.RobertaModel 22 | :members: 23 | 24 | 25 | ``RobertaForMaskedLM`` 26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 27 | 28 | .. autoclass:: transformers.RobertaForMaskedLM 29 | :members: 30 | 31 | 32 | ``RobertaForSequenceClassification`` 33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 34 | 35 | .. autoclass:: transformers.RobertaForSequenceClassification 36 | :members: 37 | 38 | 39 | ``TFRobertaModel`` 40 | ~~~~~~~~~~~~~~~~~~~~ 41 | 42 | .. autoclass:: transformers.TFRobertaModel 43 | :members: 44 | 45 | 46 | ``TFRobertaForMaskedLM`` 47 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 48 | 49 | .. autoclass:: transformers.TFRobertaForMaskedLM 50 | :members: 51 | 52 | 53 | ``TFRobertaForSequenceClassification`` 54 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 55 | 56 | .. autoclass:: transformers.TFRobertaForSequenceClassification 57 | :members: 58 | -------------------------------------------------------------------------------- /docs/source/model_doc/gpt.rst: -------------------------------------------------------------------------------- 1 | OpenAI GPT 2 | ---------------------------------------------------- 3 | 4 | ``OpenAIGPTConfig`` 5 | ~~~~~~~~~~~~~~~~~~~~~ 6 | 7 | .. autoclass:: transformers.OpenAIGPTConfig 8 | :members: 9 | 10 | 11 | ``OpenAIGPTTokenizer`` 12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | .. autoclass:: transformers.OpenAIGPTTokenizer 15 | :members: 16 | 17 | 18 | ``OpenAIGPTModel`` 19 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 20 | 21 | .. autoclass:: transformers.OpenAIGPTModel 22 | :members: 23 | 24 | 25 | ``OpenAIGPTLMHeadModel`` 26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 27 | 28 | .. autoclass:: transformers.OpenAIGPTLMHeadModel 29 | :members: 30 | 31 | 32 | ``OpenAIGPTDoubleHeadsModel`` 33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 34 | 35 | .. autoclass:: transformers.OpenAIGPTDoubleHeadsModel 36 | :members: 37 | 38 | 39 | ``TFOpenAIGPTModel`` 40 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 41 | 42 | .. autoclass:: transformers.TFOpenAIGPTModel 43 | :members: 44 | 45 | 46 | ``TFOpenAIGPTLMHeadModel`` 47 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 48 | 49 | .. autoclass:: transformers.TFOpenAIGPTLMHeadModel 50 | :members: 51 | 52 | 53 | ``TFOpenAIGPTDoubleHeadsModel`` 54 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 55 | 56 | .. 
autoclass:: transformers.TFOpenAIGPTDoubleHeadsModel 57 | :members: 58 | -------------------------------------------------------------------------------- /examples/generate_postag_and_labels.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | def read_conll(file_name):  # read a CoNLL file into a list of sentences, each a list of token columns 4 | with open(file_name, "r") as fin: 5 | data = fin.readlines() 6 | conll_data = [] 7 | sent_data = [] 8 | for line in data: 9 | if len(line.strip()) == 0: 10 | if len(sent_data) > 0: 11 | conll_data.append(sent_data) 12 | sent_data = [] 13 | else: 14 | sent_data.append(line.strip().split("\t")) 15 | if len(sent_data) > 0: 16 | conll_data.append(sent_data) 17 | return conll_data 18 | 19 | if __name__ == '__main__': 20 | 21 | train_file = sys.argv[1]  # CoNLL-format training file 22 | postag_file = sys.argv[4]  # output path for the POS-tag inventory 23 | label_file = sys.argv[5]  # output path for the label inventory 24 | 25 | all_data = read_conll(train_file) 26 | 27 | postag_set = set() 28 | label_set = set() 29 | for sent in all_data: 30 | for line in sent: 31 | postag_set.add(line[4])  # column 5: POS tag 32 | label_set.add(line[7])  # column 8: dependency label 33 | 34 | postag_list = ['_'] + list(postag_set) 35 | label_list = ['_'] + list(label_set) 36 | 37 | with open(postag_file, "w") as fout: 38 | for item in postag_list: 39 | fout.write(item+"\n") 40 | 41 | with open(label_file, "w") as fout: 42 | for item in label_list: 43 | fout.write(item+"\n") 44 | 45 | -------------------------------------------------------------------------------- /docs/source/model_sharing.md: -------------------------------------------------------------------------------- 1 | # Model upload and sharing 2 | 3 | Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the CLI that's built into the library. 4 | 5 | **First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then: 6 | 7 | ```shell 8 | transformers-cli login 9 | # log in using the same credentials as on huggingface.co 10 | ``` 11 | Upload your model: 12 | ```shell 13 | transformers-cli upload ./path/to/pretrained_model/ 14 | 15 | # ^^ Upload folder containing weights/tokenizer/config 16 | # saved via `.save_pretrained()` 17 | 18 | transformers-cli upload ./config.json [--filename folder/foobar.json] 19 | 20 | # ^^ Upload a single file 21 | # (you can optionally override its filename, which can be nested inside a folder) 22 | ``` 23 | 24 | Your model will then be accessible through its identifier, a concatenation of your username and the folder name above: 25 | ```python 26 | "username/pretrained_model" 27 | ``` 28 | 29 | Anyone can load it from code: 30 | ```python 31 | from transformers import AutoModel, AutoTokenizer 32 | tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model") 33 | model = AutoModel.from_pretrained("username/pretrained_model") 34 | ``` 35 | 36 | Finally, list all your files on S3: 37 | ```shell 38 | transformers-cli ls 39 | # List all your S3 objects.
40 | ``` 41 | 42 | -------------------------------------------------------------------------------- /src/transformers/__main__.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | 3 | 4 | def main(): 5 | import sys 6 | 7 | if len(sys.argv) < 2 or sys.argv[1] not in ["convert", "train", "predict", "serve"]: 8 | print( 9 | "First argument to `transformers` command line interface should be one of: \n" 10 | ">> convert serve train predict" 11 | ) 12 | return  # nothing valid to dispatch; avoids an IndexError on a bare `transformers` call 13 | if sys.argv[1] == "convert": 14 | from transformers.commands import convert 15 | 16 | convert(sys.argv) 17 | elif sys.argv[1] == "train": 18 | from transformers.commands import train 19 | 20 | train(sys.argv) 21 | elif sys.argv[1] == "serve": 22 | pass 23 | # from argparse import ArgumentParser 24 | # from transformers.commands.serving import ServeCommand 25 | # parser = ArgumentParser('Transformers CLI tool', usage='transformers serve [<args>]') 26 | # commands_parser = parser.add_subparsers(help='transformers-cli command helpers') 27 | 28 | # # Register commands 29 | # ServeCommand.register_subcommand(commands_parser) 30 | 31 | # # Let's go 32 | # args = parser.parse_args() 33 | 34 | # if not hasattr(args, 'func'): 35 | # parser.print_help() 36 | # exit(1) 37 | # # Run 38 | # service = args.func(args) 39 | # service.run() 40 | 41 | 42 | if __name__ == "__main__": 43 | main() 44 | -------------------------------------------------------------------------------- /src/transformers/commands/download.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from transformers.commands import BaseTransformersCLICommand 4 | 5 | 6 | def download_command_factory(args): 7 | return DownloadCommand(args.model, args.cache_dir, args.force) 8 | 9 | 10 | class DownloadCommand(BaseTransformersCLICommand): 11 | @staticmethod 12 | def register_subcommand(parser: ArgumentParser): 13 | download_parser = parser.add_parser("download") 14 | download_parser.add_argument( 15 | "--cache-dir", type=str, default=None, help="Path to location to store the models" 16 | ) 17 | download_parser.add_argument( 18 | "--force", action="store_true", help="Force the model to be downloaded even if already in cache-dir" 19 | ) 20 | download_parser.add_argument("model", type=str, help="Name of the model to download") 21 | download_parser.set_defaults(func=download_command_factory) 22 | 23 | def __init__(self, model: str, cache: str, force: bool): 24 | self._model = model 25 | self._cache = cache 26 | self._force = force 27 | 28 | def run(self): 29 | from transformers import AutoModel, AutoTokenizer 30 | 31 | AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 32 | AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 33 |
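For illustration only, a hypothetical invocation of the `download` command registered above (`bert-base-uncased` and the cache path are placeholder values; `--cache-dir` and `--force` are the flags defined in `register_subcommand`):

```shell
# fetch a model and its tokenizer into a local cache directory
transformers-cli download --cache-dir ./model_cache bert-base-uncased

# add --force to re-download even if the files are already cached
transformers-cli download --cache-dir ./model_cache --force bert-base-uncased
```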
-------------------------------------------------------------------------------- /docs/source/model_doc/albert.rst: -------------------------------------------------------------------------------- 1 | ALBERT 2 | ---------------------------------------------------- 3 | 4 | ``AlbertConfig`` 5 | ~~~~~~~~~~~~~~~~~~~~~ 6 | 7 | .. autoclass:: transformers.AlbertConfig 8 | :members: 9 | 10 | 11 | ``AlbertTokenizer`` 12 | ~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | .. autoclass:: transformers.AlbertTokenizer 15 | :members: 16 | 17 | 18 | ``AlbertModel`` 19 | ~~~~~~~~~~~~~~~~~~~~ 20 | 21 | .. autoclass:: transformers.AlbertModel 22 | :members: 23 | 24 | 25 | ``AlbertForMaskedLM`` 26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 27 | 28 | .. autoclass:: transformers.AlbertForMaskedLM 29 | :members: 30 | 31 | 32 | ``AlbertForSequenceClassification`` 33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 34 | 35 | .. autoclass:: transformers.AlbertForSequenceClassification 36 | :members: 37 | 38 | 39 | ``AlbertForQuestionAnswering`` 40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 41 | 42 | .. autoclass:: transformers.AlbertForQuestionAnswering 43 | :members: 44 | 45 | 46 | ``TFAlbertModel`` 47 | ~~~~~~~~~~~~~~~~~~~~ 48 | 49 | .. autoclass:: transformers.TFAlbertModel 50 | :members: 51 | 52 | 53 | ``TFAlbertForMaskedLM`` 54 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 55 | 56 | .. autoclass:: transformers.TFAlbertForMaskedLM 57 | :members: 58 | 59 | 60 | ``TFAlbertForSequenceClassification`` 61 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 62 | 63 | .. autoclass:: transformers.TFAlbertForSequenceClassification 64 | :members: 65 | -------------------------------------------------------------------------------- /docs/source/bertology.rst: -------------------------------------------------------------------------------- 1 | BERTology 2 | --------- 3 | 4 | There is a growing field of study concerned with investigating the inner workings of large-scale transformers like BERT (that some call "BERTology"). Some good examples of this field are: 5 | 6 | 7 | * BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: https://arxiv.org/abs/1905.05950 8 | * Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650 9 | * What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: https://arxiv.org/abs/1906.04341 10 | 11 | In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted from the great work of Paul Michel (https://arxiv.org/abs/1905.10650): 12 | 13 | 14 | * accessing all the hidden-states of BERT/GPT/GPT-2, 15 | * accessing all the attention weights for each head of BERT/GPT/GPT-2, 16 | * retrieving head output values and gradients to be able to compute head importance scores and prune heads as explained in https://arxiv.org/abs/1905.10650. 17 | 18 | To help you understand and use these features, we have added a specific example script: `bertology.py `_ which extracts information from, and prunes, a model pre-trained on GLUE.
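A rough sketch of how the hooks listed in `bertology.rst` are exposed on a BERT model. The `output_hidden_states`/`output_attentions` flags and the position of the extra outputs in the returned tuple are assumptions to verify against the installed version of the library:

```python
import torch
from transformers import BertConfig, BertModel, BertTokenizer

# Ask the model to return all hidden states and attention weights.
config = BertConfig.from_pretrained("bert-base-uncased", output_hidden_states=True, output_attentions=True)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased", config=config)

input_ids = torch.tensor([tokenizer.encode("Who was Jim Henson?", add_special_tokens=True)])
outputs = model(input_ids)
# With both flags set, the output tuple ends with the per-layer
# hidden states and attention weights.
hidden_states, attentions = outputs[-2], outputs[-1]

# Prune attention heads 0 and 2 of layer 0 (mapping: layer index -> head indices).
model.prune_heads({0: [0, 2]})
```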
16 | """ MMBT configuration """ 17 | 18 | 19 | import logging 20 | 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | class MMBTConfig(object): 26 | """Configuration class to store the configuration of a `MMBT Model`. 27 | 28 | Args: 29 | config: config of the underlying Transformer models. It's values are copied over to use a single config. 30 | num_labels: Size of final Linear layer for classification. 31 | modal_hidden_size: Embedding dimension of the non-text modality encoder. 32 | """ 33 | 34 | def __init__(self, config, num_labels=None, modal_hidden_size=2048): 35 | self.__dict__ = config.__dict__ 36 | self.modal_hidden_size = modal_hidden_size 37 | if num_labels: 38 | self.num_labels = num_labels 39 | -------------------------------------------------------------------------------- /docs/source/notebooks.rst: -------------------------------------------------------------------------------- 1 | Notebooks 2 | ================================================ 3 | 4 | We include `three Jupyter Notebooks `_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model. 5 | 6 | 7 | * 8 | The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb `_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models. 9 | 10 | * 11 | The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb `_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models. 12 | 13 | * 14 | The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb `_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model. 15 | 16 | Please follow the instructions given in the notebooks to run and modify them. 17 | -------------------------------------------------------------------------------- /docs/source/model_doc/xlm.rst: -------------------------------------------------------------------------------- 1 | XLM 2 | ---------------------------------------------------- 3 | 4 | ``XLMConfig`` 5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 6 | 7 | .. autoclass:: transformers.XLMConfig 8 | :members: 9 | 10 | ``XLMTokenizer`` 11 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 12 | 13 | .. autoclass:: transformers.XLMTokenizer 14 | :members: 15 | 16 | ``XLMModel`` 17 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | 19 | .. autoclass:: transformers.XLMModel 20 | :members: 21 | 22 | 23 | ``XLMWithLMHeadModel`` 24 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 25 | 26 | .. autoclass:: transformers.XLMWithLMHeadModel 27 | :members: 28 | 29 | 30 | ``XLMForSequenceClassification`` 31 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 32 | 33 | .. autoclass:: transformers.XLMForSequenceClassification 34 | :members: 35 | 36 | 37 | ``XLMForQuestionAnswering`` 38 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 39 | 40 | .. autoclass:: transformers.XLMForQuestionAnswering 41 | :members: 42 | 43 | 44 | ``TFXLMModel`` 45 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 46 | 47 | .. 
autoclass:: transformers.TFXLMModel 48 | :members: 49 | 50 | 51 | ``TFXLMWithLMHeadModel`` 52 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 53 | 54 | .. autoclass:: transformers.TFXLMWithLMHeadModel 55 | :members: 56 | 57 | 58 | ``TFXLMForSequenceClassification`` 59 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 60 | 61 | .. autoclass:: transformers.TFXLMForSequenceClassification 62 | :members: 63 | 64 | 65 | ``TFXLMForQuestionAnsweringSimple`` 66 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 67 | 68 | .. autoclass:: transformers.TFXLMForQuestionAnsweringSimple 69 | :members: 70 | -------------------------------------------------------------------------------- /tests/test_tokenization_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 HuggingFace Inc.. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import unittest 18 | 19 | from transformers import PreTrainedTokenizer 20 | from transformers.tokenization_gpt2 import GPT2Tokenizer 21 | 22 | from .utils import slow 23 | 24 | 25 | class TokenizerUtilsTest(unittest.TestCase): 26 | def check_tokenizer_from_pretrained(self, tokenizer_class): 27 | s3_models = list(tokenizer_class.max_model_input_sizes.keys()) 28 | for model_name in s3_models[:1]: 29 | tokenizer = tokenizer_class.from_pretrained(model_name) 30 | self.assertIsNotNone(tokenizer) 31 | self.assertIsInstance(tokenizer, tokenizer_class) 32 | self.assertIsInstance(tokenizer, PreTrainedTokenizer) 33 | 34 | for special_tok in tokenizer.all_special_tokens: 35 | self.assertIsInstance(special_tok, str) 36 | special_tok_id = tokenizer.convert_tokens_to_ids(special_tok) 37 | self.assertIsInstance(special_tok_id, int) 38 | 39 | @slow 40 | def test_pretrained_tokenizers(self): 41 | self.check_tokenizer_from_pretrained(GPT2Tokenizer) 42 | -------------------------------------------------------------------------------- /docs/source/model_doc/distilbert.rst: -------------------------------------------------------------------------------- 1 | DistilBERT 2 | ---------------------------------------------------- 3 | 4 | ``DistilBertConfig`` 5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 6 | 7 | .. autoclass:: transformers.DistilBertConfig 8 | :members: 9 | 10 | 11 | ``DistilBertTokenizer`` 12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | .. autoclass:: transformers.DistilBertTokenizer 15 | :members: 16 | 17 | 18 | ``DistilBertModel`` 19 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 20 | 21 | .. autoclass:: transformers.DistilBertModel 22 | :members: 23 | 24 | 25 | ``DistilBertForMaskedLM`` 26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 27 | 28 | .. autoclass:: transformers.DistilBertForMaskedLM 29 | :members: 30 | 31 | 32 | ``DistilBertForSequenceClassification`` 33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 34 | 35 | .. 
autoclass:: transformers.DistilBertForSequenceClassification 36 | :members: 37 | 38 | 39 | ``DistilBertForQuestionAnswering`` 40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 41 | 42 | .. autoclass:: transformers.DistilBertForQuestionAnswering 43 | :members: 44 | 45 | ``TFDistilBertModel`` 46 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 47 | 48 | .. autoclass:: transformers.TFDistilBertModel 49 | :members: 50 | 51 | 52 | ``TFDistilBertForMaskedLM`` 53 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 54 | 55 | .. autoclass:: transformers.TFDistilBertForMaskedLM 56 | :members: 57 | 58 | 59 | ``TFDistilBertForSequenceClassification`` 60 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 61 | 62 | .. autoclass:: transformers.TFDistilBertForSequenceClassification 63 | :members: 64 | 65 | 66 | ``TFDistilBertForQuestionAnswering`` 67 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 68 | 69 | .. autoclass:: transformers.TFDistilBertForQuestionAnswering 70 | :members: 71 | -------------------------------------------------------------------------------- /docs/source/model_doc/xlnet.rst: -------------------------------------------------------------------------------- 1 | XLNet 2 | ---------------------------------------------------- 3 | 4 | ``XLNetConfig`` 5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 6 | 7 | .. autoclass:: transformers.XLNetConfig 8 | :members: 9 | 10 | 11 | ``XLNetTokenizer`` 12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | .. autoclass:: transformers.XLNetTokenizer 15 | :members: 16 | 17 | 18 | ``XLNetModel`` 19 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 20 | 21 | .. autoclass:: transformers.XLNetModel 22 | :members: 23 | 24 | 25 | ``XLNetLMHeadModel`` 26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 27 | 28 | .. autoclass:: transformers.XLNetLMHeadModel 29 | :members: 30 | 31 | 32 | ``XLNetForSequenceClassification`` 33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 34 | 35 | .. autoclass:: transformers.XLNetForSequenceClassification 36 | :members: 37 | 38 | 39 | ``XLNetForQuestionAnswering`` 40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 41 | 42 | .. autoclass:: transformers.XLNetForQuestionAnswering 43 | :members: 44 | 45 | 46 | ``TFXLNetModel`` 47 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 48 | 49 | .. autoclass:: transformers.TFXLNetModel 50 | :members: 51 | 52 | 53 | ``TFXLNetLMHeadModel`` 54 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 55 | 56 | .. autoclass:: transformers.TFXLNetLMHeadModel 57 | :members: 58 | 59 | 60 | ``TFXLNetForSequenceClassification`` 61 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 62 | 63 | .. autoclass:: transformers.TFXLNetForSequenceClassification 64 | :members: 65 | 66 | 67 | ``TFXLNetForQuestionAnsweringSimple`` 68 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 69 | 70 | .. autoclass:: transformers.TFXLNetForQuestionAnsweringSimple 71 | :members: 72 | -------------------------------------------------------------------------------- /src/transformers/configuration_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ RoBERTa configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_bert import BertConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", 28 | "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", 29 | "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", 30 | "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json", 31 | "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-config.json", 32 | "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-config.json", 33 | } 34 | 35 | 36 | class RobertaConfig(BertConfig): 37 | pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 38 | -------------------------------------------------------------------------------- /tests/test_tokenization_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | from transformers.tokenization_distilbert import DistilBertTokenizer 18 | 19 | from .test_tokenization_bert import BertTokenizationTest 20 | from .utils import slow 21 | 22 | 23 | class DistilBertTokenizationTest(BertTokenizationTest): 24 | 25 | tokenizer_class = DistilBertTokenizer 26 | 27 | def get_tokenizer(self, **kwargs): 28 | return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs) 29 | 30 | @slow 31 | def test_sequence_builders(self): 32 | tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") 33 | 34 | text = tokenizer.encode("sequence builders", add_special_tokens=False) 35 | text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) 36 | 37 | encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) 38 | encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) 39 | 40 | assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] 41 | assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [ 42 | tokenizer.sep_token_id 43 | ] 44 | -------------------------------------------------------------------------------- /src/transformers/configuration_xlm_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ XLM-RoBERTa configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_roberta import RobertaConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json", 28 | "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json", 29 | "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json", 30 | "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json", 31 | "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json", 32 | "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json", 33 | } 34 | 35 | 36 | class XLMRobertaConfig(RobertaConfig): 37 | pretrained_config_archive_map = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 38 | -------------------------------------------------------------------------------- /tests/test_modeling_encoder_decoder.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Hugging Face Inc. Team 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import logging 17 | import unittest 18 | 19 | from transformers import is_torch_available 20 | 21 | from .utils import require_torch, slow 22 | 23 | 24 | if is_torch_available(): 25 | from transformers import BertModel, BertForMaskedLM, Model2Model 26 | from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP 27 | 28 | 29 | @require_torch 30 | class EncoderDecoderModelTest(unittest.TestCase): 31 | @slow 32 | def test_model2model_from_pretrained(self): 33 | logging.basicConfig(level=logging.INFO) 34 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 35 | model = Model2Model.from_pretrained(model_name) 36 | self.assertIsInstance(model.encoder, BertModel) 37 | self.assertIsInstance(model.decoder, BertForMaskedLM) 38 | self.assertEqual(model.decoder.config.is_decoder, True) 39 | self.assertEqual(model.encoder.config.is_decoder, False) 40 | 41 | def test_model2model_from_pretrained_not_bert(self): 42 | logging.basicConfig(level=logging.INFO) 43 | with self.assertRaises(ValueError): 44 | _ = Model2Model.from_pretrained("roberta") 45 | 46 | with self.assertRaises(ValueError): 47 | _ = Model2Model.from_pretrained("distilbert") 48 | 49 | with self.assertRaises(ValueError): 50 | _ = Model2Model.from_pretrained("does-not-exist") 51 | -------------------------------------------------------------------------------- /tests/test_tokenization_auto.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | import logging 18 | import unittest 19 | 20 | from transformers import ( 21 | BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, 22 | GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, 23 | AutoTokenizer, 24 | BertTokenizer, 25 | GPT2Tokenizer, 26 | ) 27 | 28 | from .utils import SMALL_MODEL_IDENTIFIER, slow 29 | 30 | 31 | class AutoTokenizerTest(unittest.TestCase): 32 | @slow 33 | def test_tokenizer_from_pretrained(self): 34 | logging.basicConfig(level=logging.INFO) 35 | for model_name in list(BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys())[:1]: 36 | tokenizer = AutoTokenizer.from_pretrained(model_name) 37 | self.assertIsNotNone(tokenizer) 38 | self.assertIsInstance(tokenizer, BertTokenizer) 39 | self.assertGreater(len(tokenizer), 0) 40 | 41 | for model_name in list(GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP.keys())[:1]: 42 | tokenizer = AutoTokenizer.from_pretrained(model_name) 43 | self.assertIsNotNone(tokenizer) 44 | self.assertIsInstance(tokenizer, GPT2Tokenizer) 45 | self.assertGreater(len(tokenizer), 0) 46 | 47 | def test_tokenizer_from_pretrained_identifier(self): 48 | logging.basicConfig(level=logging.INFO) 49 | tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER) 50 | self.assertIsInstance(tokenizer, BertTokenizer) 51 | self.assertEqual(len(tokenizer), 12) 52 | -------------------------------------------------------------------------------- /docs/source/main_classes/optimizer_schedules.rst: -------------------------------------------------------------------------------- 1 | Optimizer 2 | ---------------------------------------------------- 3 | 4 | The ``.optimization`` module provides: 5 | 6 | - an optimizer with fixed weight decay that can be used to fine-tune models, 7 | - several schedules in the form of schedule objects that inherit from ``_LRSchedule``, and 8 | - a gradient accumulation class to accumulate the gradients of multiple batches. 9 | 10 | ``AdamW`` 11 | ~~~~~~~~~~~~~~~~ 12 | 13 | .. autoclass:: transformers.AdamW 14 | :members: 15 | 16 | ``AdamWeightDecay`` 17 | ~~~~~~~~~~~~~~~~~~~ 18 | 19 | .. autoclass:: transformers.AdamWeightDecay 20 | :members: 21 | 22 | .. autofunction:: transformers.create_optimizer 23 | 24 | 25 | Schedules 26 | ---------------------------------------------------- 27 | 28 | Learning Rate Schedules 29 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 30 | 31 | .. autofunction:: transformers.get_constant_schedule 32 | 33 | 34 | .. autofunction:: transformers.get_constant_schedule_with_warmup 35 | 36 | .. image:: /imgs/warmup_constant_schedule.png 37 | :target: /imgs/warmup_constant_schedule.png 38 | :alt: 39 | 40 | 41 | .. autofunction:: transformers.get_cosine_schedule_with_warmup 42 | 43 | 44 | .. image:: /imgs/warmup_cosine_schedule.png 45 | :target: /imgs/warmup_cosine_schedule.png 46 | :alt: 47 | 48 | 49 | .. autofunction:: transformers.get_cosine_with_hard_restarts_schedule_with_warmup 50 | 51 | .. image:: /imgs/warmup_cosine_hard_restarts_schedule.png 52 | :target: /imgs/warmup_cosine_hard_restarts_schedule.png 53 | :alt: 54 | 55 | 56 | 57 | .. autofunction:: transformers.get_linear_schedule_with_warmup 58 | 59 | .. image:: /imgs/warmup_linear_schedule.png 60 | :target: /imgs/warmup_linear_schedule.png 61 | :alt: 62 | 63 | ``Warmup`` 64 | ~~~~~~~~~~~~~~~~ 65 | 66 | .. autoclass:: transformers.Warmup 67 | :members: 68 | 69 | Gradient Strategies 70 | ---------------------------------------------------- 71 | 72 | ``GradientAccumulator`` 73 | ~~~~~~~~~~~~~~~~~~~~~~~ 74 | 75 | ..
autoclass:: transformers.GradientAccumulator 76 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Generating the documentation 2 | 3 | To generate the documentation, you first have to build it. Several packages are necessary to build the doc; 4 | you can install them with the following command, at the root of the code repository: 5 | 6 | ```bash 7 | pip install -e .[docs] 8 | ``` 9 | 10 | ## Packages installed 11 | 12 | Here's an overview of all the packages installed. If you ran the previous command installing all of these 13 | packages, you do not need to run the following commands. 14 | 15 | Building it requires the package `sphinx`, which you can 16 | install using: 17 | 18 | ```bash 19 | pip install -U sphinx 20 | ``` 21 | 22 | You will also need the custom [theme](https://github.com/readthedocs/sphinx_rtd_theme) installed by 23 | [Read The Docs](https://readthedocs.org/). You can install it using the following command: 24 | 25 | ```bash 26 | pip install sphinx_rtd_theme 27 | ``` 28 | 29 | The third necessary package is `recommonmark`, which accepts Markdown as well as reStructuredText: 30 | 31 | ```bash 32 | pip install recommonmark 33 | ``` 34 | 35 | ## Building the documentation 36 | 37 | Make sure that there is a symlink to the examples README (in `/examples`) inside the source folder. Run the following 38 | command to generate it: 39 | 40 | ```bash 41 | ln -s ../../examples/README.md examples.md 42 | ``` 43 | 44 | Once you have set up `sphinx`, you can build the documentation by running the following command in the `/docs` folder: 45 | 46 | ```bash 47 | make html 48 | ``` 49 | 50 | --- 51 | **NOTE** 52 | 53 | If you are adding/removing elements from the toc-tree or from any structural item, it is recommended to clean the build 54 | directory before rebuilding. Run the following command to clean and build: 55 | 56 | ```bash 57 | make clean && make html 58 | ``` 59 | 60 | --- 61 | 62 | It should build the static app that will be available under `/docs/_build/html`. 63 | 64 | ## Adding a new element to the tree (toc-tree) 65 | 66 | Accepted files are reStructuredText (.rst) and Markdown (.md). Create a file with one of those extensions and put it 67 | in the source directory. You can then link it in the toc-tree by adding the filename without the extension. 68 | -------------------------------------------------------------------------------- /templates/adding_a_new_model/tests/test_tokenization_xxx.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 XXX Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
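Relating back to `optimizer_schedules.rst` above: the documented schedule functions return plain PyTorch learning-rate schedulers, so they slot into an ordinary training loop. A minimal sketch (the `torch.nn.Linear` model and the step counts are placeholder assumptions, not values from the docs):

```python
import torch
from transformers import AdamW, get_linear_schedule_with_warmup

model = torch.nn.Linear(10, 2)  # stand-in for a real transformer model
optimizer = AdamW(model.parameters(), lr=5e-5)
# Linearly warm up for 100 steps, then linearly decay to 0 over the remaining steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=1000
)

for step in range(1000):
    # ... compute the loss and call loss.backward() here ...
    optimizer.step()
    scheduler.step()  # advance the learning rate once per optimizer step
    optimizer.zero_grad()
```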
15 | 16 | 17 | import os 18 | import unittest 19 | 20 | from transformers.tokenization_bert import VOCAB_FILES_NAMES, XxxTokenizer 21 | 22 | from .test_tokenization_common import TokenizerTesterMixin 23 | 24 | 25 | class XxxTokenizationTest(TokenizerTesterMixin, unittest.TestCase): 26 | 27 | tokenizer_class = XxxTokenizer 28 | 29 | def setUp(self): 30 | super(XxxTokenizationTest, self).setUp() 31 | 32 | vocab_tokens = [ 33 | "[UNK]", 34 | "[CLS]", 35 | "[SEP]", 36 | "want", 37 | "##want", 38 | "##ed", 39 | "wa", 40 | "un", 41 | "runn", 42 | "##ing", 43 | ",", 44 | "low", 45 | "lowest", 46 | ] 47 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) 48 | with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: 49 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) 50 | 51 | def get_tokenizer(self, **kwargs): 52 | return XxxTokenizer.from_pretrained(self.tmpdirname, **kwargs) 53 | 54 | def get_input_output_texts(self): 55 | input_text = "UNwant\u00E9d,running" 56 | output_text = "unwanted, running" 57 | return input_text, output_text 58 | 59 | def test_full_tokenizer(self): 60 | tokenizer = self.tokenizer_class(self.vocab_file) 61 | 62 | tokens = tokenizer.tokenize("UNwant\u00E9d,running") 63 | self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) 64 | self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) 65 | -------------------------------------------------------------------------------- /src/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The T5 authors and HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert T5 checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import T5Config, T5Model, load_tf_weights_in_t5 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = T5Config.from_json_file(config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = T5Model(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_t5(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 
48 | ) 49 | parser.add_argument( 50 | "--config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained T5 model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert XXX checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import XxxConfig, XxxForPreTraining, load_tf_weights_in_xxx 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = XxxConfig.from_json_file(config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = XxxForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_xxx(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
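For reference, the T5 conversion function above can also be driven from Python rather than the command line. A sketch under assumed conditions (all three paths are hypothetical placeholders; a real TensorFlow checkpoint, its JSON config, and a TensorFlow installation are required):

```python
from transformers.convert_t5_original_tf_checkpoint_to_pytorch import (
    convert_tf_checkpoint_to_pytorch,
)

convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path="/path/to/t5/model.ckpt",       # placeholder TF checkpoint prefix
    config_file="/path/to/t5/config.json",             # placeholder architecture config
    pytorch_dump_path="/path/to/out/pytorch_model.bin" # placeholder output file
)
```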
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = BertConfig.from_json_file(bert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = BertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_bert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--bert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained BERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert ALBERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = AlbertConfig.from_json_file(albert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = AlbertForMaskedLM(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_albert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--albert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained ALBERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /utils/link_tester.py: -------------------------------------------------------------------------------- 1 | """ Link tester. 2 | 3 | This little utility reads all the python files in the repository, 4 | scans for links pointing to S3 and tests the links one by one. Raises an error 5 | at the end of the scan if at least one link was reported broken. 6 | """ 7 | import os 8 | import re 9 | import sys 10 | 11 | import requests 12 | 13 | 14 | REGEXP_FIND_S3_LINKS = r"""([\"'])(https:\/\/s3)(.*)?\1""" 15 | 16 | 17 | def list_python_files_in_repository(): 18 | """ List all python files in the repository. 19 | 20 | This function assumes that the script is executed in the root folder. 21 | """ 22 | source_code_files = [] 23 | for path, subdirs, files in os.walk("."): 24 | if "templates" in path: 25 | continue 26 | for name in files: 27 | if ".py" in name and ".pyc" not in name: 28 | path_to_files = os.path.join(path, name) 29 | source_code_files.append(path_to_files) 30 | 31 | return source_code_files 32 | 33 | 34 | def find_all_links(file_paths): 35 | links = [] 36 | for path in file_paths: 37 | links += scan_code_for_links(path) 38 | 39 | return links 40 | 41 | 42 | def scan_code_for_links(source): 43 | """ Scans the file to find links using a regular expression. 44 | Returns a list of links. 45 | """ 46 | with open(source, "r") as content: 47 | content = content.read() 48 | raw_links = re.findall(REGEXP_FIND_S3_LINKS, content) 49 | links = [prefix + suffix for _, prefix, suffix in raw_links] 50 | 51 | return links 52 | 53 | 54 | def check_all_links(links): 55 | """ Check that the provided links are valid. 56 | 57 | Links are considered valid if a HEAD request to the server 58 | returns a 200 status code. 
59 | """ 60 | broken_links = [] 61 | for link in links: 62 | head = requests.head(link) 63 | if head.status_code != 200: 64 | broken_links.append(link) 65 | 66 | return broken_links 67 | 68 | 69 | if __name__ == "__main__": 70 | file_paths = list_python_files_in_repository() 71 | links = find_all_links(file_paths) 72 | broken_links = check_all_links(links) 73 | print("Looking for broken links to pre-trained models/configs/tokenizers...") 74 | if broken_links: 75 | print("The following links did not respond:") 76 | for link in broken_links: 77 | print("- {}".format(link)) 78 | sys.exit(1) 79 | print("All links are ok.") 80 | -------------------------------------------------------------------------------- /docs/source/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | Transformers is tested on Python 3.5+ and PyTorch 1.1.0 4 | 5 | ## With pip 6 | 7 | PyTorch Transformers can be installed using pip as follows: 8 | 9 | ``` bash 10 | pip install transformers 11 | ``` 12 | 13 | ## From source 14 | 15 | To install from source, clone the repository and install with: 16 | 17 | ``` bash 18 | git clone https://github.com/huggingface/transformers.git 19 | cd transformers 20 | pip install . 21 | ``` 22 | 23 | ## Tests 24 | 25 | An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples). 26 | 27 | Refer to the [contributing guide](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md#tests) for details about running tests. 28 | 29 | ## OpenAI GPT original tokenization workflow 30 | 31 | If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` and `SpaCy`: 32 | 33 | ``` bash 34 | pip install spacy ftfy==4.4.3 35 | python -m spacy download en 36 | ``` 37 | 38 | If you don't install `ftfy` and `SpaCy`, the `OpenAI GPT` tokenizer will default to tokenize using BERT's `BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry). 39 | 40 | ## Note on model downloads (Continuous Integration or large-scale deployments) 41 | 42 | If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way faster, and cheaper. Feel free to contact us privately if you need any help. 43 | 44 | ## Do you want to run a Transformer model on a mobile device? 45 | 46 | You should check out our [swift-coreml-transformers](https://github.com/huggingface/swift-coreml-transformers) repo. 47 | 48 | It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`, `DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices. 49 | 50 | At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML, 51 | or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting! 
52 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import unittest 4 | from distutils.util import strtobool 5 | 6 | from transformers.file_utils import _tf_available, _torch_available 7 | 8 | 9 | CACHE_DIR = os.path.join(tempfile.gettempdir(), "transformers_test") 10 | 11 | SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy" 12 | 13 | 14 | def parse_flag_from_env(key, default=False): 15 | try: 16 | value = os.environ[key] 17 | except KeyError: 18 | # KEY isn't set, default to `default`. 19 | _value = default 20 | else: 21 | # KEY is set, convert it to True or False. 22 | try: 23 | _value = strtobool(value) 24 | except ValueError: 25 | # More values are supported, but let's keep the message simple. 26 | raise ValueError("If set, {} must be yes or no.".format(key)) 27 | return _value 28 | 29 | 30 | _run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False) 31 | _run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=False) 32 | 33 | 34 | def slow(test_case): 35 | """ 36 | Decorator marking a test as slow. 37 | 38 | Slow tests are skipped by default. Set the RUN_SLOW environment variable 39 | to a truthy value to run them. 40 | 41 | """ 42 | if not _run_slow_tests: 43 | test_case = unittest.skip("test is slow")(test_case) 44 | return test_case 45 | 46 | 47 | def custom_tokenizers(test_case): 48 | """ 49 | Decorator marking a test for a custom tokenizer. 50 | 51 | Custom tokenizers require additional dependencies, and are skipped 52 | by default. Set the RUN_CUSTOM_TOKENIZERS environment variable 53 | to a truthy value to run them. 54 | """ 55 | if not _run_custom_tokenizers: 56 | test_case = unittest.skip("test of custom tokenizers")(test_case) 57 | return test_case 58 | 59 | 60 | def require_torch(test_case): 61 | """ 62 | Decorator marking a test that requires PyTorch. 63 | 64 | These tests are skipped when PyTorch isn't installed. 65 | 66 | """ 67 | if not _torch_available: 68 | test_case = unittest.skip("test requires PyTorch")(test_case) 69 | return test_case 70 | 71 | 72 | def require_tf(test_case): 73 | """ 74 | Decorator marking a test that requires TensorFlow. 75 | 76 | These tests are skipped when TensorFlow isn't installed. 77 | 78 | """ 79 | if not _tf_available: 80 | test_case = unittest.skip("test requires TensorFlow")(test_case) 81 | return test_case 82 | 83 | 84 | if _torch_available: 85 | # Set the USE_CUDA environment variable to select a GPU. 86 | torch_device = "cuda" if parse_flag_from_env("USE_CUDA") else "cpu" 87 | else: 88 | torch_device = None 89 | -------------------------------------------------------------------------------- /src/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
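A hypothetical test module showing how the decorators defined in `tests/utils.py` above compose (the class name and test body are placeholders; set `RUN_SLOW` to a truthy value to un-skip the slow test):

```python
import unittest

from .utils import require_torch, slow  # the helpers defined above


@require_torch  # skipped entirely when PyTorch is not installed
class ExampleSlowTest(unittest.TestCase):
    @slow  # skipped unless RUN_SLOW is set to a truthy value
    def test_expensive_download(self):
        self.assertTrue(True)  # placeholder body
```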
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert OpenAI GPT-2 checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import CONFIG_NAME, WEIGHTS_NAME, GPT2Config, GPT2Model, load_tf_weights_in_gpt2 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 30 | # Construct model 31 | if gpt2_config_file == "": 32 | config = GPT2Config() 33 | else: 34 | config = GPT2Config.from_json_file(gpt2_config_file) 35 | model = GPT2Model(config) 36 | 37 | # Load weights from numpy 38 | load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) 39 | 40 | # Save pytorch-model 41 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 42 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 43 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 44 | torch.save(model.state_dict(), pytorch_weights_dump_path) 45 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 46 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 47 | f.write(config.to_json_string()) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | # Required parameters 53 | parser.add_argument( 54 | "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 55 | ) 56 | parser.add_argument( 57 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 58 | ) 59 | parser.add_argument( 60 | "--gpt2_config_file", 61 | default="", 62 | type=str, 63 | help="An optional config json file corresponding to the pre-trained OpenAI GPT-2 model. \n" 64 | "This specifies the model architecture.", 65 | ) 66 | args = parser.parse_args() 67 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path) 68 | -------------------------------------------------------------------------------- /tests/test_configuration_common.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 HuggingFace Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
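As with the T5 script, the GPT-2 converter above can be driven from Python. A sketch with placeholder paths (passing an empty string for the config file builds a default `GPT2Config`, as the code above shows; the dump folder must exist before saving):

```python
import os

from transformers.convert_gpt2_original_tf_checkpoint_to_pytorch import (
    convert_gpt2_checkpoint_to_pytorch,
)

dump_dir = "/path/to/gpt2-pytorch"  # placeholder output folder
os.makedirs(dump_dir, exist_ok=True)
convert_gpt2_checkpoint_to_pytorch(
    gpt2_checkpoint_path="/path/to/gpt2/model.ckpt",  # placeholder TF checkpoint
    gpt2_config_file="",  # "" -> default GPT2Config()
    pytorch_dump_folder_path=dump_dir,
)
```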
15 | 16 | 17 | import json 18 | import os 19 | import tempfile 20 | 21 | 22 | class ConfigTester(object): 23 | def __init__(self, parent, config_class=None, **kwargs): 24 | self.parent = parent 25 | self.config_class = config_class 26 | self.inputs_dict = kwargs 27 | 28 | def create_and_test_config_common_properties(self): 29 | config = self.config_class(**self.inputs_dict) 30 | self.parent.assertTrue(hasattr(config, "vocab_size")) 31 | self.parent.assertTrue(hasattr(config, "hidden_size")) 32 | self.parent.assertTrue(hasattr(config, "num_attention_heads")) 33 | self.parent.assertTrue(hasattr(config, "num_hidden_layers")) 34 | 35 | def create_and_test_config_to_json_string(self): 36 | config = self.config_class(**self.inputs_dict) 37 | obj = json.loads(config.to_json_string()) 38 | for key, value in self.inputs_dict.items(): 39 | self.parent.assertEqual(obj[key], value) 40 | 41 | def create_and_test_config_to_json_file(self): 42 | config_first = self.config_class(**self.inputs_dict) 43 | 44 | with tempfile.TemporaryDirectory() as tmpdirname: 45 | json_file_path = os.path.join(tmpdirname, "config.json") 46 | config_first.to_json_file(json_file_path) 47 | config_second = self.config_class.from_json_file(json_file_path) 48 | 49 | self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) 50 | 51 | def create_and_test_config_from_and_save_pretrained(self): 52 | config_first = self.config_class(**self.inputs_dict) 53 | 54 | with tempfile.TemporaryDirectory() as tmpdirname: 55 | config_first.save_pretrained(tmpdirname) 56 | config_second = self.config_class.from_pretrained(tmpdirname) 57 | 58 | self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) 59 | 60 | def run_common_tests(self): 61 | self.create_and_test_config_common_properties() 62 | self.create_and_test_config_to_json_string() 63 | self.create_and_test_config_to_json_file() 64 | self.create_and_test_config_from_and_save_pretrained() 65 | -------------------------------------------------------------------------------- /tests/test_tokenization_ctrl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Salesforce and HuggingFace Inc. team. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import json 17 | import os 18 | import unittest 19 | 20 | from transformers.tokenization_ctrl import VOCAB_FILES_NAMES, CTRLTokenizer 21 | 22 | from .test_tokenization_common import TokenizerTesterMixin 23 | 24 | 25 | class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase): 26 | 27 | tokenizer_class = CTRLTokenizer 28 | 29 | def setUp(self): 30 | super(CTRLTokenizationTest, self).setUp() 31 | 32 | # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt 33 | vocab = ["adapt", "re@@", "a@@", "apt", "c@@", "t", "<unk>"] 34 | vocab_tokens = dict(zip(vocab, range(len(vocab)))) 35 | merges = ["#version: 0.2", "a p", "ap t", "r e", "a d", "ad apt", ""] 36 | self.special_tokens_map = {"unk_token": "<unk>"} 37 | 38 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) 39 | self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) 40 | with open(self.vocab_file, "w", encoding="utf-8") as fp: 41 | fp.write(json.dumps(vocab_tokens) + "\n") 42 | with open(self.merges_file, "w", encoding="utf-8") as fp: 43 | fp.write("\n".join(merges)) 44 | 45 | def get_tokenizer(self, **kwargs): 46 | kwargs.update(self.special_tokens_map) 47 | return CTRLTokenizer.from_pretrained(self.tmpdirname, **kwargs) 48 | 49 | def get_input_output_texts(self): 50 | input_text = "adapt react readapt apt" 51 | output_text = "adapt react readapt apt" 52 | return input_text, output_text 53 | 54 | def test_full_tokenizer(self): 55 | tokenizer = CTRLTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) 56 | text = "adapt react readapt apt" 57 | bpe_tokens = "adapt re@@ a@@ c@@ t re@@ adapt apt".split() 58 | tokens = tokenizer.tokenize(text) 59 | self.assertListEqual(tokens, bpe_tokens) 60 | 61 | input_tokens = tokens + [tokenizer.unk_token] 62 | 63 | input_bpe_tokens = [0, 1, 2, 4, 5, 1, 0, 3, 6] 64 | self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) 65 | -------------------------------------------------------------------------------- /src/transformers/tokenization_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | """Tokenization classes for DistilBERT.""" 16 | 17 | 18 | import logging 19 | 20 | from .tokenization_bert import BertTokenizer 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 26 | 27 | PRETRAINED_VOCAB_FILES_MAP = { 28 | "vocab_file": { 29 | "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 30 | "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 31 | "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt", 32 | "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", 33 | } 34 | } 35 | 36 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 37 | "distilbert-base-uncased": 512, 38 | "distilbert-base-uncased-distilled-squad": 512, 39 | "distilbert-base-german-cased": 512, 40 | "distilbert-base-multilingual-cased": 512, 41 | } 42 | 43 | 44 | class DistilBertTokenizer(BertTokenizer): 45 | r""" 46 | Constructs a DistilBertTokenizer. 47 | :class:`~transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece 48 | 49 | Args: 50 | vocab_file: Path to a one-wordpiece-per-line vocabulary file 51 | do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True 52 | do_basic_tokenize: Whether to do basic tokenization before wordpiece. 53 | max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the 54 | minimum of this value (if specified) and the underlying BERT model's sequence length. 55 | never_split: List of tokens which will never be split during tokenization. Only has an effect when 56 | do_basic_tokenize=True 57 | """ 58 | 59 | vocab_files_names = VOCAB_FILES_NAMES 60 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 61 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 62 | -------------------------------------------------------------------------------- /docs/source/benchmarks.md: -------------------------------------------------------------------------------- 1 | # Benchmarks 2 | 3 | This section is dedicated to the Benchmarks done by the library, both by maintainers, contributors and users. These 4 | benchmark will help keep track of the preformance improvements that are brought to our models across versions. 5 | 6 | ## Benchmarking all models for inference 7 | 8 | As of version 2.1 we have benchmarked all models for inference, across many different settings: using PyTorch, with 9 | and without TorchScript, using TensorFlow, with and without XLA. All of those tests were done across CPUs (except for 10 | TensorFlow XLA) and GPUs. 11 | 12 | The approach is detailed in the [following blogpost](https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2) 13 | 14 | The results are available [here](https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing). 15 | 16 | ## TF2 with mixed precision, XLA, Distribution (@tlkh) 17 | 18 | This work was done by [Timothy Liu](https://github.com/tlkh). 
19 | 20 | There are very positive results to be gained from the various TensorFlow 2.0 features: 21 | 22 | - Automatic Mixed Precision (AMP) 23 | - XLA compiler 24 | - Distribution strategies (multi-GPU) 25 | 26 | The benefits are listed here (tested on CoLA, MRPC, SST-2): 27 | 28 | - AMP: Between 1.4x to 1.6x decrease in overall time without change in batch size 29 | - AMP+XLA: Up to 2.5x decrease in overall time on SST-2 (larger dataset) 30 | - Distribution: Between 1.4x to 3.4x decrease in overall time on 4xV100 31 | - Combined: Up to 5.7x decrease in overall training time, or 9.1x training throughput 32 | 33 | The model quality (measured by the validation accuracy) fluctuates slightly. Taking an average of 4 training runs 34 | on a single GPU gives the following results: 35 | 36 | - CoLA: AMP results in slightly lower acc (0.820 vs 0.824) 37 | - MRPC: AMP results in lower acc (0.823 vs 0.835) 38 | - SST-2: AMP results in slightly lower acc (0.918 vs 0.922) 39 | 40 | However, in a distributed setting with 4xV100 (4x batch size), AMP can yield better results: 41 | 42 | - CoLA: AMP results in higher acc (0.828 vs 0.812) 43 | - MRPC: AMP results in lower acc (0.817 vs 0.827) 44 | - SST-2: AMP results in slightly lower acc (0.926 vs 0.929) 45 | 46 | The benchmark script is available [here](https://github.com/NVAITC/benchmarking/blob/master/tf2/bert_dist.py). 47 | 48 | Note: on some tasks (e.g. MRPC), the dataset is too small. The overhead due to the model compilation with XLA as well 49 | as the distribution strategy setup does not speed things up. The XLA compile time is also the reason why, although throughput 50 | can increase a lot (e.g. 2.7x for a single GPU), the overall (end-to-end) training speed-up is not as large (as low as 1.4x). 51 | 52 | The benefit seen on SST-2 (a larger dataset) is much clearer. 53 | 54 | All results can be seen on this [Google Sheet](https://docs.google.com/spreadsheets/d/1538MN224EzjbRL239sqSiUy6YY-rAjHyXhTzz_Zptls/edit#gid=960868445). 55 | -------------------------------------------------------------------------------- /src/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import CONFIG_NAME, WEIGHTS_NAME, OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 30 | # Construct model 31 | if openai_config_file == "": 32 | config = OpenAIGPTConfig() 33 | else: 34 | config = OpenAIGPTConfig.from_json_file(openai_config_file) 35 | model = OpenAIGPTModel(config) 36 | 37 | # Load weights from numpy 38 | load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path) 39 | 40 | # Save pytorch-model 41 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 42 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 43 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 44 | torch.save(model.state_dict(), pytorch_weights_dump_path) 45 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 46 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 47 | f.write(config.to_json_string()) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | # Required parameters 53 | parser.add_argument( 54 | "--openai_checkpoint_folder_path", 55 | default=None, 56 | type=str, 57 | required=True, 58 | help="Path to the TensorFlow checkpoint path.", 59 | ) 60 | parser.add_argument( 61 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 62 | ) 63 | parser.add_argument( 64 | "--openai_config_file", 65 | default="", 66 | type=str, 67 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n" 68 | "This specifies the model architecture.", 69 | ) 70 | args = parser.parse_args() 71 | convert_openai_checkpoint_to_pytorch( 72 | args.openai_checkpoint_folder_path, args.openai_config_file, args.pytorch_dump_folder_path 73 | ) 74 | -------------------------------------------------------------------------------- /docs/source/model_doc/bert.rst: -------------------------------------------------------------------------------- 1 | BERT 2 | ---------------------------------------------------- 3 | 4 | ``BertConfig`` 5 | ~~~~~~~~~~~~~~~~~~~~~ 6 | 7 | .. autoclass:: transformers.BertConfig 8 | :members: 9 | 10 | 11 | ``BertTokenizer`` 12 | ~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | .. autoclass:: transformers.BertTokenizer 15 | :members: 16 | 17 | 18 | ``BertModel`` 19 | ~~~~~~~~~~~~~~~~~~~~ 20 | 21 | .. autoclass:: transformers.BertModel 22 | :members: 23 | 24 | 25 | ``BertForPreTraining`` 26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 27 | 28 | .. autoclass:: transformers.BertForPreTraining 29 | :members: 30 | 31 | 32 | ``BertForMaskedLM`` 33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 34 | 35 | .. autoclass:: transformers.BertForMaskedLM 36 | :members: 37 | 38 | 39 | ``BertForNextSentencePrediction`` 40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 41 | 42 | .. autoclass:: transformers.BertForNextSentencePrediction 43 | :members: 44 | 45 | 46 | ``BertForSequenceClassification`` 47 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 48 | 49 | .. autoclass:: transformers.BertForSequenceClassification 50 | :members: 51 | 52 | 53 | ``BertForMultipleChoice`` 54 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 55 | 56 | .. 
autoclass:: transformers.BertForMultipleChoice 57 | :members: 58 | 59 | 60 | ``BertForTokenClassification`` 61 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 62 | 63 | .. autoclass:: transformers.BertForTokenClassification 64 | :members: 65 | 66 | 67 | ``BertForQuestionAnswering`` 68 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 69 | 70 | .. autoclass:: transformers.BertForQuestionAnswering 71 | :members: 72 | 73 | 74 | ``TFBertModel`` 75 | ~~~~~~~~~~~~~~~~~~~~ 76 | 77 | .. autoclass:: transformers.TFBertModel 78 | :members: 79 | 80 | 81 | ``TFBertForPreTraining`` 82 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 83 | 84 | .. autoclass:: transformers.TFBertForPreTraining 85 | :members: 86 | 87 | 88 | ``TFBertForMaskedLM`` 89 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 90 | 91 | .. autoclass:: transformers.TFBertForMaskedLM 92 | :members: 93 | 94 | 95 | ``TFBertForNextSentencePrediction`` 96 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 97 | 98 | .. autoclass:: transformers.TFBertForNextSentencePrediction 99 | :members: 100 | 101 | 102 | ``TFBertForSequenceClassification`` 103 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 104 | 105 | .. autoclass:: transformers.TFBertForSequenceClassification 106 | :members: 107 | 108 | 109 | ``TFBertForMultipleChoice`` 110 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 111 | 112 | .. autoclass:: transformers.TFBertForMultipleChoice 113 | :members: 114 | 115 | 116 | ``TFBertForTokenClassification`` 117 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 118 | 119 | .. autoclass:: transformers.TFBertForTokenClassification 120 | :members: 121 | 122 | 123 | ``TFBertForQuestionAnswering`` 124 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 125 | 126 | .. autoclass:: transformers.TFBertForQuestionAnswering 127 | :members: 128 | 129 | -------------------------------------------------------------------------------- /tests/test_tokenization_openai.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import json 18 | import os 19 | import unittest 20 | 21 | from transformers.tokenization_openai import VOCAB_FILES_NAMES, OpenAIGPTTokenizer 22 | 23 | from .test_tokenization_common import TokenizerTesterMixin 24 | 25 | 26 | class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): 27 | 28 | tokenizer_class = OpenAIGPTTokenizer 29 | 30 | def setUp(self): 31 | super(OpenAIGPTTokenizationTest, self).setUp() 32 | 33 | # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt 34 | vocab = [ 35 | "l", 36 | "o", 37 | "w", 38 | "e", 39 | "r", 40 | "s", 41 | "t", 42 | "i", 43 | "d", 44 | "n", 45 | "w", 46 | "r", 47 | "t", 48 | "lo", 49 | "low", 50 | "er", 51 | "low", 52 | "lowest", 53 | "newer", 54 | "wider", 55 | "<unk>", 56 | ] 57 | vocab_tokens = dict(zip(vocab, range(len(vocab)))) 58 | merges = ["#version: 0.2", "l o", "lo w", "e r", ""] 59 | 60 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) 61 | self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) 62 | with open(self.vocab_file, "w") as fp: 63 | fp.write(json.dumps(vocab_tokens)) 64 | with open(self.merges_file, "w") as fp: 65 | fp.write("\n".join(merges)) 66 | 67 | def get_tokenizer(self, **kwargs): 68 | return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname, **kwargs) 69 | 70 | def get_input_output_texts(self): 71 | input_text = "lower newer" 72 | output_text = "lower newer" 73 | return input_text, output_text 74 | 75 | def test_full_tokenizer(self): 76 | tokenizer = OpenAIGPTTokenizer(self.vocab_file, self.merges_file) 77 | 78 | text = "lower" 79 | bpe_tokens = ["low", "er"] 80 | tokens = tokenizer.tokenize(text) 81 | self.assertListEqual(tokens, bpe_tokens) 82 | 83 | input_tokens = tokens + ["<unk>"] 84 | input_bpe_tokens = [14, 15, 20] 85 | self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) 86 | -------------------------------------------------------------------------------- /tests/test_tokenization_transfo_xl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | 17 | import os 18 | import unittest 19 | 20 | from transformers import is_torch_available 21 | 22 | from .test_tokenization_common import TokenizerTesterMixin 23 | from .utils import require_torch 24 | 25 | 26 | if is_torch_available(): 27 | from transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES 28 | 29 | 30 | @require_torch 31 | class TransfoXLTokenizationTest(TokenizerTesterMixin, unittest.TestCase): 32 | 33 | tokenizer_class = TransfoXLTokenizer if is_torch_available() else None 34 | 35 | def setUp(self): 36 | super(TransfoXLTokenizationTest, self).setUp() 37 | 38 | vocab_tokens = [ 39 | "<unk>", 40 | "[CLS]", 41 | "[SEP]", 42 | "want", 43 | "unwanted", 44 | "wa", 45 | "un", 46 | "running", 47 | ",", 48 | "low", 49 | "l", 50 | ] 51 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) 52 | with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: 53 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) 54 | 55 | def get_tokenizer(self, **kwargs): 56 | kwargs["lower_case"] = True 57 | return TransfoXLTokenizer.from_pretrained(self.tmpdirname, **kwargs) 58 | 59 | def get_input_output_texts(self): 60 | input_text = " UNwanted , running" 61 | output_text = " unwanted, running" 62 | return input_text, output_text 63 | 64 | def test_full_tokenizer(self): 65 | tokenizer = TransfoXLTokenizer(vocab_file=self.vocab_file, lower_case=True) 66 | 67 | tokens = tokenizer.tokenize(" UNwanted , running") 68 | self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"]) 69 | 70 | self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7]) 71 | 72 | def test_full_tokenizer_lower(self): 73 | tokenizer = TransfoXLTokenizer(lower_case=True) 74 | 75 | self.assertListEqual( 76 | tokenizer.tokenize(" \tHeLLo ! how \n Are yoU ? "), ["hello", "!", "how", "are", "you", "?"] 77 | ) 78 | 79 | def test_full_tokenizer_no_lower(self): 80 | tokenizer = TransfoXLTokenizer(lower_case=False) 81 | 82 | self.assertListEqual( 83 | tokenizer.tokenize(" \tHeLLo ! how \n Are yoU ? "), ["HeLLo", "!", "how", "Are", "yoU", "?"] 84 | ) 85 | -------------------------------------------------------------------------------- /src/transformers/configuration_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | """ DistilBERT model configuration """ 16 | 17 | 18 | import logging 19 | 20 | from .configuration_utils import PretrainedConfig 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 26 | "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", 27 | "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json", 28 | "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json", 29 | "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json", 30 | } 31 | 32 | 33 | class DistilBertConfig(PretrainedConfig): 34 | pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 35 | 36 | def __init__( 37 | self, 38 | vocab_size=30522, 39 | max_position_embeddings=512, 40 | sinusoidal_pos_embds=False, 41 | n_layers=6, 42 | n_heads=12, 43 | dim=768, 44 | hidden_dim=4 * 768, 45 | dropout=0.1, 46 | attention_dropout=0.1, 47 | activation="gelu", 48 | initializer_range=0.02, 49 | tie_weights_=True, 50 | qa_dropout=0.1, 51 | seq_classif_dropout=0.2, 52 | **kwargs 53 | ): 54 | super(DistilBertConfig, self).__init__(**kwargs) 55 | self.vocab_size = vocab_size 56 | self.max_position_embeddings = max_position_embeddings 57 | self.sinusoidal_pos_embds = sinusoidal_pos_embds 58 | self.n_layers = n_layers 59 | self.n_heads = n_heads 60 | self.dim = dim 61 | self.hidden_dim = hidden_dim 62 | self.dropout = dropout 63 | self.attention_dropout = attention_dropout 64 | self.activation = activation 65 | self.initializer_range = initializer_range 66 | self.tie_weights_ = tie_weights_ 67 | self.qa_dropout = qa_dropout 68 | self.seq_classif_dropout = seq_classif_dropout 69 | 70 | @property 71 | def hidden_size(self): 72 | return self.dim 73 | 74 | @property 75 | def num_attention_heads(self): 76 | return self.n_heads 77 | 78 | @property 79 | def num_hidden_layers(self): 80 | return self.n_layers 81 | -------------------------------------------------------------------------------- /src/transformers/data/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | try: 18 | from scipy.stats import pearsonr, spearmanr 19 | from sklearn.metrics import matthews_corrcoef, f1_score 20 | 21 | _has_sklearn = True 22 | except (AttributeError, ImportError) as e: 23 | _has_sklearn = False 24 | 25 | 26 | def is_sklearn_available(): 27 | return _has_sklearn 28 | 29 | 30 | if _has_sklearn: 31 | 32 | def simple_accuracy(preds, labels): 33 | return (preds == labels).mean() 34 | 35 | def acc_and_f1(preds, labels): 36 | acc = simple_accuracy(preds, labels) 37 | f1 = f1_score(y_true=labels, y_pred=preds) 38 | return { 39 | "acc": acc, 40 | "f1": f1, 41 | "acc_and_f1": (acc + f1) / 2, 42 | } 43 | 44 | def pearson_and_spearman(preds, labels): 45 | pearson_corr = pearsonr(preds, labels)[0] 46 | spearman_corr = spearmanr(preds, labels)[0] 47 | return { 48 | "pearson": pearson_corr, 49 | "spearmanr": spearman_corr, 50 | "corr": (pearson_corr + spearman_corr) / 2, 51 | } 52 | 53 | def glue_compute_metrics(task_name, preds, labels): 54 | assert len(preds) == len(labels) 55 | if task_name == "cola": 56 | return {"mcc": matthews_corrcoef(labels, preds)} 57 | elif task_name == "sst-2": 58 | return {"acc": simple_accuracy(preds, labels)} 59 | elif task_name == "mrpc": 60 | return acc_and_f1(preds, labels) 61 | elif task_name == "sts-b": 62 | return pearson_and_spearman(preds, labels) 63 | elif task_name == "qqp": 64 | return acc_and_f1(preds, labels) 65 | elif task_name == "mnli": 66 | return {"acc": simple_accuracy(preds, labels)} 67 | elif task_name == "mnli-mm": 68 | return {"acc": simple_accuracy(preds, labels)} 69 | elif task_name == "qnli": 70 | return {"acc": simple_accuracy(preds, labels)} 71 | elif task_name == "rte": 72 | return {"acc": simple_accuracy(preds, labels)} 73 | elif task_name == "wnli": 74 | return {"acc": simple_accuracy(preds, labels)} 75 | else: 76 | raise KeyError(task_name) 77 | 78 | def xnli_compute_metrics(task_name, preds, labels): 79 | assert len(preds) == len(labels) 80 | if task_name == "xnli": 81 | return {"acc": simple_accuracy(preds, labels)} 82 | else: 83 | raise KeyError(task_name) 84 | -------------------------------------------------------------------------------- /tests/test_tokenization_gpt2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import json 18 | import os 19 | import unittest 20 | 21 | from transformers.tokenization_gpt2 import VOCAB_FILES_NAMES, GPT2Tokenizer 22 | 23 | from .test_tokenization_common import TokenizerTesterMixin 24 | 25 | 26 | class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): 27 | 28 | tokenizer_class = GPT2Tokenizer 29 | 30 | def setUp(self): 31 | super(GPT2TokenizationTest, self).setUp() 32 | 33 | # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt
34 |         vocab = [
35 |             "l",
36 |             "o",
37 |             "w",
38 |             "e",
39 |             "r",
40 |             "s",
41 |             "t",
42 |             "i",
43 |             "d",
44 |             "n",
45 |             "\u0120",
46 |             "\u0120l",
47 |             "\u0120n",
48 |             "\u0120lo",
49 |             "\u0120low",
50 |             "er",
51 |             "\u0120lowest",
52 |             "\u0120newer",
53 |             "\u0120wider",
54 |             "<unk>",
55 |         ]
56 |         vocab_tokens = dict(zip(vocab, range(len(vocab))))
57 |         merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
58 |         self.special_tokens_map = {"unk_token": "<unk>"}
59 | 
60 |         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
61 |         self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
62 |         with open(self.vocab_file, "w", encoding="utf-8") as fp:
63 |             fp.write(json.dumps(vocab_tokens) + "\n")
64 |         with open(self.merges_file, "w", encoding="utf-8") as fp:
65 |             fp.write("\n".join(merges))
66 | 
67 |     def get_tokenizer(self, **kwargs):
68 |         kwargs.update(self.special_tokens_map)
69 |         return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
70 | 
71 |     def get_input_output_texts(self):
72 |         input_text = "lower newer"
73 |         output_text = "lower newer"
74 |         return input_text, output_text
75 | 
76 |     def test_full_tokenizer(self):
77 |         tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
78 |         text = "lower newer"
79 |         bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
80 |         tokens = tokenizer.tokenize(text, add_prefix_space=True)
81 |         self.assertListEqual(tokens, bpe_tokens)
82 | 
83 |         input_tokens = tokens + [tokenizer.unk_token]
84 |         input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
85 |         self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
86 | 
-------------------------------------------------------------------------------- /src/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert XLM checkpoint."""
16 | 
17 | 
18 | import argparse
19 | import json
20 | import logging
21 | 
22 | import numpy
23 | import torch
24 | 
25 | from transformers import CONFIG_NAME, WEIGHTS_NAME
26 | from transformers.tokenization_xlm import VOCAB_FILES_NAMES
27 | 
28 | 
29 | logging.basicConfig(level=logging.INFO)
30 | 
31 | 
32 | def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path):
33 |     # Load checkpoint
34 |     chkpt = torch.load(xlm_checkpoint_path, map_location="cpu")
35 | 
36 |     state_dict = chkpt["model"]
37 | 
38 |     # We have the base model one level deeper than the original XLM repository
39 |     two_levels_state_dict = {}
40 |     for k, v in state_dict.items():
41 |         if "pred_layer" in k:
42 |             two_levels_state_dict[k] = v
43 |         else:
44 |             two_levels_state_dict["transformer." + k] = v
45 | 
46 |     config = chkpt["params"]
47 |     config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray)))
48 | 
49 |     vocab = chkpt["dico_word2id"]
50 |     vocab = dict((s + "</w>" if s.find("@@") == -1 and i > 13 else s.replace("@@", ""), i) for s, i in vocab.items())
51 | 
52 |     # Save pytorch-model
53 |     pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
54 |     pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
55 |     pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["vocab_file"]
56 | 
57 |     print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
58 |     torch.save(two_levels_state_dict, pytorch_weights_dump_path)
59 | 
60 |     print("Save configuration file to {}".format(pytorch_config_dump_path))
61 |     with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
62 |         f.write(json.dumps(config, indent=2) + "\n")
63 | 
64 |     print("Save vocab file to {}".format(pytorch_vocab_dump_path))
65 |     with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f:
66 |         f.write(json.dumps(vocab, indent=2) + "\n")
67 | 
68 | 
69 | if __name__ == "__main__":
70 |     parser = argparse.ArgumentParser()
71 |     # Required parameters
72 |     parser.add_argument(
73 |         "--xlm_checkpoint_path", default=None, type=str, required=True, help="Path to the official PyTorch dump."
74 |     )
75 |     parser.add_argument(
76 |         "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
77 |     )
78 |     args = parser.parse_args()
79 |     convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path)
80 | 
-------------------------------------------------------------------------------- /src/transformers/data/processors/xnli.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ XNLI utils (dataset loading and evaluation) """
17 | 
18 | 
19 | import logging
20 | import os
21 | 
22 | from .utils import DataProcessor, InputExample
23 | 
24 | 
25 | logger = logging.getLogger(__name__)
26 | 
27 | 
28 | class XnliProcessor(DataProcessor):
29 |     """Processor for the XNLI dataset.
30 | Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207""" 31 | 32 | def __init__(self, language, train_language=None): 33 | self.language = language 34 | self.train_language = train_language 35 | 36 | def get_train_examples(self, data_dir): 37 | """See base class.""" 38 | lg = self.language if self.train_language is None else self.train_language 39 | lines = self._read_tsv(os.path.join(data_dir, "XNLI-MT-1.0/multinli/multinli.train.{}.tsv".format(lg))) 40 | examples = [] 41 | for (i, line) in enumerate(lines): 42 | if i == 0: 43 | continue 44 | guid = "%s-%s" % ("train", i) 45 | text_a = line[0] 46 | text_b = line[1] 47 | label = "contradiction" if line[2] == "contradictory" else line[2] 48 | assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str) 49 | examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 50 | return examples 51 | 52 | def get_test_examples(self, data_dir): 53 | """See base class.""" 54 | lines = self._read_tsv(os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv")) 55 | examples = [] 56 | for (i, line) in enumerate(lines): 57 | if i == 0: 58 | continue 59 | language = line[0] 60 | if language != self.language: 61 | continue 62 | guid = "%s-%s" % ("test", i) 63 | text_a = line[6] 64 | text_b = line[7] 65 | label = line[1] 66 | assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str) 67 | examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 68 | return examples 69 | 70 | def get_labels(self): 71 | """See base class.""" 72 | return ["contradiction", "entailment", "neutral"] 73 | 74 | 75 | xnli_processors = { 76 | "xnli": XnliProcessor, 77 | } 78 | 79 | xnli_output_modes = { 80 | "xnli": "classification", 81 | } 82 | 83 | xnli_tasks_num_labels = { 84 | "xnli": 3, 85 | } 86 | -------------------------------------------------------------------------------- /tests/test_tokenization_albert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 Hugging Face inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 
16 | 
17 | import os
18 | import unittest
19 | 
20 | from transformers.tokenization_albert import AlbertTokenizer
21 | 
22 | from .test_tokenization_common import TokenizerTesterMixin
23 | 
24 | 
25 | SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/spiece.model")
26 | 
27 | 
28 | class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
29 | 
30 |     tokenizer_class = AlbertTokenizer
31 | 
32 |     def setUp(self):
33 |         super(AlbertTokenizationTest, self).setUp()
34 | 
35 |         # We have a SentencePiece fixture for testing
36 |         tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
37 |         tokenizer.save_pretrained(self.tmpdirname)
38 | 
39 |     def get_tokenizer(self, **kwargs):
40 |         return AlbertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
41 | 
42 |     def get_input_output_texts(self):
43 |         input_text = "this is a test"
44 |         output_text = "this is a test"
45 |         return input_text, output_text
46 | 
47 |     def test_full_tokenizer(self):
48 |         tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True)
49 | 
50 |         tokens = tokenizer.tokenize("This is a test")
51 |         self.assertListEqual(tokens, ["▁this", "▁is", "▁a", "▁test"])
52 | 
53 |         self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289])
54 | 
55 |         tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
56 |         self.assertListEqual(
57 |             tokens, ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "."]
58 |         )
59 |         ids = tokenizer.convert_tokens_to_ids(tokens)
60 |         self.assertListEqual(ids, [31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])
61 | 
62 |         back_tokens = tokenizer.convert_ids_to_tokens(ids)
63 |         self.assertListEqual(
64 |             back_tokens,
65 |             ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "."],
66 |         )
67 | 
68 |     def test_sequence_builders(self):
69 |         tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
70 | 
71 |         text = tokenizer.encode("sequence builders")
72 |         text_2 = tokenizer.encode("multi-sequence build")
73 | 
74 |         encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
75 |         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
76 | 
77 |         assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
78 |         assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [
79 |             tokenizer.sep_token_id
80 |         ]
81 | 
-------------------------------------------------------------------------------- /examples/configuration/configuration_bert.py: --------------------------------------------------------------------------------
1 | 
2 | from transformers import BertConfig
3 | 
4 | class BertForDependencyParsingConfig(BertConfig):
5 |     def __init__(
6 |         self,
7 |         vocab_size=30522,
8 |         hidden_size=768,
9 |         num_hidden_layers=12,
10 |         num_attention_heads=12,
11 |         intermediate_size=3072,
12 |         hidden_act="gelu",
13 |         hidden_dropout_prob=0.1,
14 |         attention_probs_dropout_prob=0.1,
15 |         max_position_embeddings=512,
16 |         type_vocab_size=2,
17 |         initializer_range=0.02,
18 |         layer_norm_eps=1e-12,
19 |         use_postag=False,
20 |         num_postags=0,
21 |         num_labels=0,
22 |         arc_space=512,
23 |         label_space=128,
24 |         **kwargs
25 |     ):
26 |         super(BertForDependencyParsingConfig, self).__init__(
27 |             vocab_size=vocab_size,
28 |             hidden_size=hidden_size,
29 |             num_hidden_layers=num_hidden_layers,
30 |             num_attention_heads=num_attention_heads,
31 |             intermediate_size=intermediate_size,
32 |             hidden_act=hidden_act,
33 |             hidden_dropout_prob=hidden_dropout_prob,
34 | 
attention_probs_dropout_prob=attention_probs_dropout_prob, 35 | max_position_embeddings=max_position_embeddings, 36 | type_vocab_size=type_vocab_size, 37 | initializer_range=initializer_range, 38 | layer_norm_eps=layer_norm_eps, 39 | **kwargs 40 | ) 41 | self.use_postag = use_postag 42 | self.num_postags = num_postags 43 | self.num_labels = num_labels 44 | self.arc_space = arc_space 45 | self.label_space = label_space 46 | 47 | 48 | class BertForDependencyParsingWithOrderConfig(BertConfig): 49 | def __init__( 50 | self, 51 | vocab_size=30522, 52 | hidden_size=768, 53 | num_hidden_layers=12, 54 | num_attention_heads=12, 55 | intermediate_size=3072, 56 | hidden_act="gelu", 57 | hidden_dropout_prob=0.1, 58 | attention_probs_dropout_prob=0.1, 59 | max_position_embeddings=512, 60 | type_vocab_size=2, 61 | initializer_range=0.02, 62 | layer_norm_eps=1e-12, 63 | use_postag=False, 64 | num_postags=0, 65 | num_labels=0, 66 | arc_space=512, 67 | label_space=128, 68 | max_parsing_order=32, 69 | order_space=128, 70 | **kwargs 71 | ): 72 | super(BertForDependencyParsingWithOrderConfig, self).__init__( 73 | vocab_size=vocab_size, 74 | hidden_size=hidden_size, 75 | num_hidden_layers=num_hidden_layers, 76 | num_attention_heads=num_attention_heads, 77 | intermediate_size=intermediate_size, 78 | hidden_act=hidden_act, 79 | hidden_dropout_prob=hidden_dropout_prob, 80 | attention_probs_dropout_prob=attention_probs_dropout_prob, 81 | max_position_embeddings=max_position_embeddings, 82 | type_vocab_size=type_vocab_size, 83 | initializer_range=initializer_range, 84 | layer_norm_eps=layer_norm_eps, 85 | **kwargs 86 | ) 87 | self.use_postag = use_postag 88 | self.num_postags = num_postags 89 | self.num_labels = num_labels 90 | self.arc_space = arc_space 91 | self.label_space = label_space 92 | self.max_parsing_order = max_parsing_order 93 | self.order_space = order_space -------------------------------------------------------------------------------- /valohai.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - step: 4 | name: Execute python examples/run_glue.py 5 | image: pytorch/pytorch:nightly-devel-cuda10.0-cudnn7 6 | command: 7 | - python /valohai/repository/utils/download_glue_data.py --data_dir=/glue_data 8 | - pip install -e . 9 | - pip install -r examples/requirements.txt 10 | - python examples/run_glue.py --do_train --data_dir=/glue_data/{parameter-value:task_name} {parameters} 11 | parameters: 12 | - name: model_type 13 | pass-as: --model_type={v} 14 | type: string 15 | default: bert 16 | - name: model_name_or_path 17 | pass-as: --model_name_or_path={v} 18 | type: string 19 | default: bert-base-uncased 20 | - name: task_name 21 | pass-as: --task_name={v} 22 | type: string 23 | default: MRPC 24 | - name: max_seq_length 25 | pass-as: --max_seq_length={v} 26 | description: The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded. 27 | type: integer 28 | default: 128 29 | - name: per_gpu_train_batch_size 30 | pass-as: --per_gpu_train_batch_size={v} 31 | description: Batch size per GPU/CPU for training. 32 | type: integer 33 | default: 8 34 | - name: per_gpu_eval_batch_size 35 | pass-as: --per_gpu_eval_batch_size={v} 36 | description: Batch size per GPU/CPU for evaluation. 
37 | type: integer 38 | default: 8 39 | - name: gradient_accumulation_steps 40 | pass-as: --gradient_accumulation_steps={v} 41 | description: Number of updates steps to accumulate before performing a backward/update pass. 42 | type: integer 43 | default: 1 44 | - name: learning_rate 45 | pass-as: --learning_rate={v} 46 | description: The initial learning rate for Adam. 47 | type: float 48 | default: 0.00005 49 | - name: adam_epsilon 50 | pass-as: --adam_epsilon={v} 51 | description: Epsilon for Adam optimizer. 52 | type: float 53 | default: 0.00000001 54 | - name: max_grad_norm 55 | pass-as: --max_grad_norm={v} 56 | description: Max gradient norm. 57 | type: float 58 | default: 1.0 59 | - name: num_train_epochs 60 | pass-as: --num_train_epochs={v} 61 | description: Total number of training epochs to perform. 62 | type: integer 63 | default: 3 64 | - name: max_steps 65 | pass-as: --max_steps={v} 66 | description: If > 0, set total number of training steps to perform. Override num_train_epochs. 67 | type: integer 68 | default: -1 69 | - name: warmup_steps 70 | pass-as: --warmup_steps={v} 71 | description: Linear warmup over warmup_steps. 72 | type: integer 73 | default: -1 74 | - name: logging_steps 75 | pass-as: --logging_steps={v} 76 | description: Log every X updates steps. 77 | type: integer 78 | default: 25 79 | - name: save_steps 80 | pass-as: --save_steps={v} 81 | description: Save checkpoint every X updates steps. 82 | type: integer 83 | default: -1 84 | - name: output_dir 85 | pass-as: --output_dir={v} 86 | type: string 87 | default: /valohai/outputs 88 | - name: evaluate_during_training 89 | description: Run evaluation during training at each logging step. 90 | type: flag 91 | default: true 92 | - name: do_lower_case 93 | description: Set this flag if you are using an uncased model. 94 | type: flag 95 | -------------------------------------------------------------------------------- /tests/test_model_card.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 HuggingFace Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import json 18 | import os 19 | import tempfile 20 | import unittest 21 | 22 | from transformers.modelcard import ModelCard 23 | 24 | 25 | class ModelCardTester(unittest.TestCase): 26 | def setUp(self): 27 | self.inputs_dict = { 28 | "model_details": { 29 | "Organization": "testing", 30 | "Model date": "today", 31 | "Model version": "v2.1, Developed by Test Corp in 2019.", 32 | "Architecture": "Convolutional Neural Network.", 33 | }, 34 | "metrics": "BLEU and ROUGE-1", 35 | "evaluation_data": { 36 | "Datasets": {"BLEU": "My-great-dataset-v1", "ROUGE-1": "My-short-dataset-v2.1"}, 37 | "Preprocessing": "See details on https://arxiv.org/pdf/1810.03993.pdf", 38 | }, 39 | "training_data": { 40 | "Dataset": "English Wikipedia dump dated 2018-12-01", 41 | "Preprocessing": "Using SentencePiece vocabulary of size 52k tokens. 
See details on https://arxiv.org/pdf/1810.03993.pdf", 42 | }, 43 | "quantitative_analyses": {"BLEU": 55.1, "ROUGE-1": 76}, 44 | } 45 | 46 | def test_model_card_common_properties(self): 47 | modelcard = ModelCard.from_dict(self.inputs_dict) 48 | self.assertTrue(hasattr(modelcard, "model_details")) 49 | self.assertTrue(hasattr(modelcard, "intended_use")) 50 | self.assertTrue(hasattr(modelcard, "factors")) 51 | self.assertTrue(hasattr(modelcard, "metrics")) 52 | self.assertTrue(hasattr(modelcard, "evaluation_data")) 53 | self.assertTrue(hasattr(modelcard, "training_data")) 54 | self.assertTrue(hasattr(modelcard, "quantitative_analyses")) 55 | self.assertTrue(hasattr(modelcard, "ethical_considerations")) 56 | self.assertTrue(hasattr(modelcard, "caveats_and_recommendations")) 57 | 58 | def test_model_card_to_json_string(self): 59 | modelcard = ModelCard.from_dict(self.inputs_dict) 60 | obj = json.loads(modelcard.to_json_string()) 61 | for key, value in self.inputs_dict.items(): 62 | self.assertEqual(obj[key], value) 63 | 64 | def test_model_card_to_json_file(self): 65 | model_card_first = ModelCard.from_dict(self.inputs_dict) 66 | 67 | with tempfile.TemporaryDirectory() as tmpdirname: 68 | filename = os.path.join(tmpdirname, "modelcard.json") 69 | model_card_first.to_json_file(filename) 70 | model_card_second = ModelCard.from_json_file(filename) 71 | 72 | self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict()) 73 | 74 | def test_model_card_from_and_save_pretrained(self): 75 | model_card_first = ModelCard.from_dict(self.inputs_dict) 76 | 77 | with tempfile.TemporaryDirectory() as tmpdirname: 78 | model_card_first.save_pretrained(tmpdirname) 79 | model_card_second = ModelCard.from_pretrained(tmpdirname) 80 | 81 | self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict()) 82 | -------------------------------------------------------------------------------- /src/transformers/convert_kcnet_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert KCNet checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import json 21 | from io import open 22 | 23 | import torch 24 | import numpy 25 | 26 | from transformers import CONFIG_NAME, WEIGHTS_NAME 27 | from transformers.tokenization_xlm import VOCAB_FILES_NAMES 28 | 29 | import logging 30 | logging.basicConfig(level=logging.INFO) 31 | 32 | def convert_kcnet_checkpoint_to_pytorch(kcnet_checkpoint_path, pytorch_dump_folder_path): 33 | # Load checkpoint 34 | chkpt = torch.load(kcnet_checkpoint_path, map_location='cpu') 35 | 36 | if 'model' in chkpt.keys(): 37 | state_dict = chkpt['model'] 38 | else: 39 | state_dict = chkpt['encoder'] 40 | 41 | state_dict = {(k[7:] if k.startswith('module.') else k): v for k, v in state_dict.items()} 42 | 43 | # We have the base model one level deeper than the original XLM repository 44 | two_levels_state_dict = {} 45 | for k, v in state_dict.items(): 46 | if 'pred_layer' in k: 47 | two_levels_state_dict[k] = v 48 | else: 49 | two_levels_state_dict['transformer.' + k] = v 50 | 51 | config = chkpt['params'] 52 | config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))) 53 | 54 | vocab = chkpt['dico_word2id'] 55 | vocab = dict((s + '' if s.find('@@') == -1 and i > 13 else s.replace('@@', ''), i) for s, i in vocab.items()) 56 | 57 | # Save pytorch-model 58 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 59 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 60 | pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['vocab_file'] 61 | 62 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 63 | torch.save(two_levels_state_dict, pytorch_weights_dump_path) 64 | 65 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 66 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 67 | f.write(json.dumps(config, indent=2) + "\n") 68 | 69 | print("Save vocab file to {}".format(pytorch_vocab_dump_path)) 70 | with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f: 71 | f.write(json.dumps(vocab, indent=2) + "\n") 72 | 73 | 74 | if __name__ == "__main__": 75 | parser = argparse.ArgumentParser() 76 | ## Required parameters 77 | parser.add_argument("--kcnet_checkpoint_path", 78 | default = None, 79 | type = str, 80 | required = True, 81 | help = "Path the official PyTorch dump.") 82 | parser.add_argument("--pytorch_dump_folder_path", 83 | default = None, 84 | type = str, 85 | required = True, 86 | help = "Path to the output PyTorch model.") 87 | args = parser.parse_args() 88 | convert_kcnet_checkpoint_to_pytorch(args.kcnet_checkpoint_path, args.pytorch_dump_folder_path) 89 | -------------------------------------------------------------------------------- /tests/test_tokenization_xlm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | import json
18 | import os
19 | import unittest
20 | 
21 | from transformers.tokenization_xlm import VOCAB_FILES_NAMES, XLMTokenizer
22 | 
23 | from .test_tokenization_common import TokenizerTesterMixin
24 | from .utils import slow
25 | 
26 | 
27 | class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
28 | 
29 |     tokenizer_class = XLMTokenizer
30 | 
31 |     def setUp(self):
32 |         super(XLMTokenizationTest, self).setUp()
33 | 
34 |         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
35 |         vocab = [
36 |             "l",
37 |             "o",
38 |             "w",
39 |             "e",
40 |             "r",
41 |             "s",
42 |             "t",
43 |             "i",
44 |             "d",
45 |             "n",
46 |             "w</w>",
47 |             "r</w>",
48 |             "t</w>",
49 |             "lo",
50 |             "low",
51 |             "er</w>",
52 |             "low</w>",
53 |             "lowest</w>",
54 |             "newer</w>",
55 |             "wider</w>",
56 |             "<unk>",
57 |         ]
58 |         vocab_tokens = dict(zip(vocab, range(len(vocab))))
59 |         merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
60 | 
61 |         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
62 |         self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
63 |         with open(self.vocab_file, "w") as fp:
64 |             fp.write(json.dumps(vocab_tokens))
65 |         with open(self.merges_file, "w") as fp:
66 |             fp.write("\n".join(merges))
67 | 
68 |     def get_tokenizer(self, **kwargs):
69 |         return XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs)
70 | 
71 |     def get_input_output_texts(self):
72 |         input_text = "lower newer"
73 |         output_text = "lower newer"
74 |         return input_text, output_text
75 | 
76 |     def test_full_tokenizer(self):
77 |         """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
78 |         tokenizer = XLMTokenizer(self.vocab_file, self.merges_file)
79 | 
80 |         text = "lower"
81 |         bpe_tokens = ["low", "er</w>"]
82 |         tokens = tokenizer.tokenize(text)
83 |         self.assertListEqual(tokens, bpe_tokens)
84 | 
85 |         input_tokens = tokens + ["<unk>"]
86 |         input_bpe_tokens = [14, 15, 20]
87 |         self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
88 | 
89 |     @slow
90 |     def test_sequence_builders(self):
91 |         tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
92 | 
93 |         text = tokenizer.encode("sequence builders", add_special_tokens=False)
94 |         text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
95 | 
96 |         encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
97 |         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
98 | 
99 |         assert encoded_sentence == [1] + text + [1]
100 |         assert encoded_pair == [1] + text + [1] + text_2 + [1]
101 | 
-------------------------------------------------------------------------------- /tests/test_hf_api.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2019-present, the HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 16 | 17 | import os 18 | import time 19 | import unittest 20 | 21 | import requests 22 | from requests.exceptions import HTTPError 23 | 24 | from transformers.hf_api import HfApi, HfFolder, PresignedUrl, S3Obj 25 | 26 | 27 | USER = "__DUMMY_TRANSFORMERS_USER__" 28 | PASS = "__DUMMY_TRANSFORMERS_PASS__" 29 | FILES = [ 30 | ( 31 | "Test-{}.txt".format(int(time.time())), 32 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"), 33 | ), 34 | ( 35 | "yoyo {}.txt".format(int(time.time())), # space is intentional 36 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/empty.txt"), 37 | ), 38 | ] 39 | 40 | 41 | class HfApiCommonTest(unittest.TestCase): 42 | _api = HfApi(endpoint="https://moon-staging.huggingface.co") 43 | 44 | 45 | class HfApiLoginTest(HfApiCommonTest): 46 | def test_login_invalid(self): 47 | with self.assertRaises(HTTPError): 48 | self._api.login(username=USER, password="fake") 49 | 50 | def test_login_valid(self): 51 | token = self._api.login(username=USER, password=PASS) 52 | self.assertIsInstance(token, str) 53 | 54 | 55 | class HfApiEndpointsTest(HfApiCommonTest): 56 | @classmethod 57 | def setUpClass(cls): 58 | """ 59 | Share this valid token in all tests below. 60 | """ 61 | cls._token = cls._api.login(username=USER, password=PASS) 62 | 63 | def test_whoami(self): 64 | user = self._api.whoami(token=self._token) 65 | self.assertEqual(user, USER) 66 | 67 | def test_presign(self): 68 | for FILE_KEY, FILE_PATH in FILES: 69 | urls = self._api.presign(token=self._token, filename=FILE_KEY) 70 | self.assertIsInstance(urls, PresignedUrl) 71 | self.assertEqual(urls.type, "text/plain") 72 | 73 | def test_presign_and_upload(self): 74 | for FILE_KEY, FILE_PATH in FILES: 75 | access_url = self._api.presign_and_upload(token=self._token, filename=FILE_KEY, filepath=FILE_PATH) 76 | self.assertIsInstance(access_url, str) 77 | with open(FILE_PATH, "r") as f: 78 | body = f.read() 79 | r = requests.get(access_url) 80 | self.assertEqual(r.text, body) 81 | 82 | def test_list_objs(self): 83 | objs = self._api.list_objs(token=self._token) 84 | self.assertIsInstance(objs, list) 85 | if len(objs) > 0: 86 | o = objs[-1] 87 | self.assertIsInstance(o, S3Obj) 88 | 89 | 90 | class HfFolderTest(unittest.TestCase): 91 | def test_token_workflow(self): 92 | """ 93 | Test the whole token save/get/delete workflow, 94 | with the desired behavior with respect to non-existent tokens. 95 | """ 96 | token = "token-{}".format(int(time.time())) 97 | HfFolder.save_token(token) 98 | self.assertEqual(HfFolder.get_token(), token) 99 | HfFolder.delete_token() 100 | HfFolder.delete_token() 101 | # ^^ not an error, we test that the 102 | # second call does not fail. 
103 |         self.assertEqual(HfFolder.get_token(), None)
104 | 
-------------------------------------------------------------------------------- /tests/test_optimization_tf.py: --------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from transformers import is_tf_available
4 | 
5 | from .utils import require_tf
6 | 
7 | 
8 | if is_tf_available():
9 |     import tensorflow as tf
10 |     from tensorflow.python.eager import context
11 |     from tensorflow.python.framework import ops
12 |     from transformers import create_optimizer, GradientAccumulator
13 | 
14 | 
15 | @require_tf
16 | class OptimizationTFTest(unittest.TestCase):
17 |     def assertListAlmostEqual(self, list1, list2, tol):
18 |         self.assertEqual(len(list1), len(list2))
19 |         for a, b in zip(list1, list2):
20 |             self.assertAlmostEqual(a, b, delta=tol)
21 | 
22 |     def testGradientAccumulator(self):
23 |         accumulator = GradientAccumulator()
24 |         accumulator([tf.constant([1.0, 2.0])])
25 |         accumulator([tf.constant([-2.0, 1.0])])
26 |         accumulator([tf.constant([-1.0, 2.0])])
27 |         with self.assertRaises(ValueError):
28 |             accumulator([tf.constant([1.0, 1.0]), tf.constant([2.0, 2.0])])
29 |         self.assertEqual(accumulator.step, 3)
30 |         self.assertEqual(len(accumulator.gradients), 1)
31 |         self.assertListAlmostEqual(accumulator.gradients[0].numpy().tolist(), [-2.0, 5.0], tol=1e-2)
32 |         accumulator.reset()
33 |         self.assertEqual(accumulator.step, 0)
34 |         self.assertListAlmostEqual(accumulator.gradients[0].numpy().tolist(), [0.0, 0.0], tol=1e-2)
35 | 
36 |     def testGradientAccumulatorDistributionStrategy(self):
37 |         context._context = None
38 |         ops.enable_eager_execution_internal()
39 |         physical_devices = tf.config.experimental.list_physical_devices("CPU")
40 |         tf.config.experimental.set_virtual_device_configuration(
41 |             physical_devices[0],
42 |             [tf.config.experimental.VirtualDeviceConfiguration(), tf.config.experimental.VirtualDeviceConfiguration()],
43 |         )
44 | 
45 |         devices = tf.config.experimental.list_logical_devices(device_type="CPU")
46 |         strategy = tf.distribute.MirroredStrategy(devices=[device.name for device in devices])
47 | 
48 |         with strategy.scope():
49 |             accumulator = GradientAccumulator()
50 |             variable = tf.Variable([4.0, 3.0])
51 |             optimizer = create_optimizer(5e-5, 10, 5)
52 |             gradient_placeholder = tf.Variable([0.0, 0.0], trainable=False)
53 | 
54 |         def accumulate_on_replica(gradient):
55 |             accumulator([gradient])
56 | 
57 |         def apply_on_replica():
58 |             optimizer.apply_gradients(list(zip(accumulator.gradients, [variable])), 1.0)
59 | 
60 |         @tf.function
61 |         def accumulate(grad1, grad2):
62 |             with strategy.scope():
63 |                 gradient_placeholder.values[0].assign(grad1)
64 |                 gradient_placeholder.values[1].assign(grad2)
65 |                 strategy.experimental_run_v2(accumulate_on_replica, args=(gradient_placeholder,))
66 | 
67 |         @tf.function
68 |         def apply_grad():
69 |             with strategy.scope():
70 |                 strategy.experimental_run_v2(apply_on_replica)
71 | 
72 |         accumulate([1.0, 2.0], [-1.0, 1.0])
73 |         accumulate([3.0, -1.0], [-1.0, -1.0])
74 |         accumulate([-2.0, 2.0], [3.0, -2.0])
75 |         self.assertEqual(accumulator.step, 3)
76 |         self.assertListAlmostEqual(accumulator._gradients[0].values[0].value().numpy().tolist(), [2.0, 3.0], tol=1e-2)
77 |         self.assertListAlmostEqual(accumulator._gradients[0].values[1].value().numpy().tolist(), [1.0, -2.0], tol=1e-2)
78 |         apply_grad()
79 |         self.assertListAlmostEqual(variable.value().numpy().tolist(), [4.0, 3.0], tol=1e-2)
80 |         accumulator.reset()
81 |         self.assertEqual(accumulator.step, 0)
82 |         self.assertListAlmostEqual(accumulator._gradients[0].values[0].value().numpy().tolist(), [0.0, 0.0], tol=1e-2)
83 |         self.assertListAlmostEqual(accumulator._gradients[0].values[1].value().numpy().tolist(), [0.0, 0.0], tol=1e-2)
84 | 
-------------------------------------------------------------------------------- /templates/adding_a_new_model/README.md: --------------------------------------------------------------------------------
1 | # How to add a new model in 🤗Transformers
2 | 
3 | This folder describes the process for adding a new model to 🤗Transformers and provides templates for the required files.
4 | 
5 | The library is designed to incorporate a variety of models and code bases. As such, the process for adding a new model usually consists mostly of copy-pasting the relevant original code into the various sections of the templates included in the present repository.
6 | 
7 | One important point, though, is that the library has the following goals, which impact the way models are incorporated:
8 | 
9 | - one specific feature of the API is the capability to run the model and tokenizer inline. The tokenization code thus often has to be slightly adapted to allow for running in the python interpreter.
10 | - the package is also designed to be self-consistent, with a small and reliable set of package dependencies. As a consequence, additional dependencies are usually not allowed when adding a model, but can be allowed for the inclusion of a new tokenizer (recent examples of dependencies added for tokenizer specificities include `sentencepiece` and `sacremoses`). Please make sure to check the existing dependencies when possible before adding a new one.
11 | 
12 | For a quick overview of the library organization, please check the [QuickStart section of the documentation](https://huggingface.co/transformers/quickstart.html).
13 | 
14 | # Typical workflow for including a model
15 | 
16 | Here is an overview of the general workflow:
17 | 
18 | - [ ] add model/configuration/tokenization classes
19 | - [ ] add conversion scripts
20 | - [ ] add tests
21 | - [ ] finalize
22 | 
23 | Let's detail what should be done at each step.
24 | 
25 | ## Adding model/configuration/tokenization classes
26 | 
27 | Here is the workflow for adding model/configuration/tokenization classes:
28 | 
29 | - [ ] copy the python files from the present folder to the main folder and rename them, replacing `xxx` with your model name
30 | - [ ] edit the files to replace `XXX` (with various casing) with your model name
31 | - [ ] copy-paste or create a simple configuration class for your model in the `configuration_...` file
32 | - [ ] copy-paste or create the code for your model in the `modeling_...` files (PyTorch and TF 2.0)
33 | - [ ] copy-paste or create a tokenizer class for your model in the `tokenization_...` file
34 | 
35 | ## Adding conversion scripts
36 | 
37 | Here is the workflow for the conversion scripts:
38 | 
39 | - [ ] copy the conversion script (`convert_...`) from the present folder to the main folder
40 | - [ ] edit this script to convert your original checkpoint weights to the current pytorch ones (a rough sketch of the copy-and-rename steps is given below)
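41 | 
42 | For illustration, the copy-and-rename steps above could be scripted roughly as follows. This is a sketch, not a maintained script: the model name `newmodel` is a hypothetical placeholder, and the casing replacements should be reviewed by hand afterwards.
43 | 
44 | ```python
45 | # Illustrative sketch only — "newmodel" is a hypothetical placeholder name.
46 | import shutil
47 | from pathlib import Path
48 | 
49 | MODEL = "newmodel"
50 | 
51 | # Template file -> destination, run from the repository root.
52 | COPIES = {
53 |     "templates/adding_a_new_model/configuration_xxx.py": f"src/transformers/configuration_{MODEL}.py",
54 |     "templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py":
55 |         f"src/transformers/convert_{MODEL}_original_tf_checkpoint_to_pytorch.py",
56 |     "templates/adding_a_new_model/tests/test_tokenization_xxx.py": f"tests/test_tokenization_{MODEL}.py",
57 | }
58 | 
59 | for src, dst in COPIES.items():
60 |     shutil.copy(src, dst)
61 |     text = Path(dst).read_text(encoding="utf-8")
62 |     # Replace the placeholder in its common casings; review the result by hand.
63 |     text = text.replace("XXX", MODEL.upper()).replace("Xxx", MODEL.capitalize()).replace("xxx", MODEL)
64 |     Path(dst).write_text(text, encoding="utf-8")
65 | ```
66 | 
67 | An equivalent `cp` plus `sed` in your shell works just as well; the point is simply that the template files become renamed copies with the placeholder substituted.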
68 | 
69 | ## Adding tests
70 | 
71 | Here is the workflow for adding tests:
72 | 
73 | - [ ] copy the python files from the `tests` sub-folder of the present folder to the `tests` subfolder of the main folder and rename them, replacing `xxx` with your model name
74 | - [ ] edit the test files to replace `XXX` (with various casing) with your model name
75 | - [ ] edit the test code as needed
76 | 
77 | ## Final steps
78 | 
79 | You can then finish the addition by adding imports for your classes in the common files:
80 | 
81 | - [ ] add imports for all the relevant classes in `__init__.py`
82 | - [ ] add your configuration in `configuration_auto.py`
83 | - [ ] add your PyTorch and TF 2.0 model respectively in `modeling_auto.py` and `modeling_tf_auto.py`
84 | - [ ] add your tokenizer in `tokenization_auto.py`
85 | - [ ] add your models and tokenizer to `pipeline.py`
86 | - [ ] add a link to your conversion script in the main conversion utility (currently in `__main__` but will be moved to the `commands` subfolder in the near future)
87 | - [ ] edit the PyTorch to TF 2.0 conversion script to add your model in the `convert_pytorch_checkpoint_to_tf2.py` file
88 | - [ ] add a mention of your model in the doc: `README.md` and the documentation itself at `docs/source/pretrained_models.rst`
89 | - [ ] upload the pretrained weights, configuration and vocabulary files
90 | 
-------------------------------------------------------------------------------- /src/transformers/commands/run.py: --------------------------------------------------------------------------------
1 | import logging
2 | from argparse import ArgumentParser
3 | 
4 | from transformers.commands import BaseTransformersCLICommand
5 | from transformers.pipelines import SUPPORTED_TASKS, Pipeline, PipelineDataFormat, pipeline
6 | 
7 | 
8 | logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
9 | 
10 | 
11 | def try_infer_format_from_ext(path: str):
12 |     if not path:
13 |         return "pipe"
14 | 
15 |     for ext in PipelineDataFormat.SUPPORTED_FORMATS:
16 |         if path.endswith(ext):
17 |             return ext
18 | 
19 |     raise Exception(
20 |         "Unable to determine file format from file extension {}. "
" 21 | "Please provide the format through --format {}".format(path, PipelineDataFormat.SUPPORTED_FORMATS) 22 | ) 23 | 24 | 25 | def run_command_factory(args): 26 | nlp = pipeline( 27 | task=args.task, 28 | model=args.model if args.model else None, 29 | config=args.config, 30 | tokenizer=args.tokenizer, 31 | device=args.device, 32 | ) 33 | format = try_infer_format_from_ext(args.input) if args.format == "infer" else args.format 34 | reader = PipelineDataFormat.from_str( 35 | format=format, 36 | output_path=args.output, 37 | input_path=args.input, 38 | column=args.column if args.column else nlp.default_input_names, 39 | overwrite=args.overwrite, 40 | ) 41 | return RunCommand(nlp, reader) 42 | 43 | 44 | class RunCommand(BaseTransformersCLICommand): 45 | def __init__(self, nlp: Pipeline, reader: PipelineDataFormat): 46 | self._nlp = nlp 47 | self._reader = reader 48 | 49 | @staticmethod 50 | def register_subcommand(parser: ArgumentParser): 51 | run_parser = parser.add_parser("run", help="Run a pipeline through the CLI") 52 | run_parser.add_argument("--task", choices=SUPPORTED_TASKS.keys(), help="Task to run") 53 | run_parser.add_argument("--input", type=str, help="Path to the file to use for inference") 54 | run_parser.add_argument("--output", type=str, help="Path to the file that will be used post to write results.") 55 | run_parser.add_argument("--model", type=str, help="Name or path to the model to instantiate.") 56 | run_parser.add_argument("--config", type=str, help="Name or path to the model's config to instantiate.") 57 | run_parser.add_argument( 58 | "--tokenizer", type=str, help="Name of the tokenizer to use. (default: same as the model name)" 59 | ) 60 | run_parser.add_argument( 61 | "--column", 62 | type=str, 63 | help="Name of the column to use as input. (For multi columns input as QA use column1,columns2)", 64 | ) 65 | run_parser.add_argument( 66 | "--format", 67 | type=str, 68 | default="infer", 69 | choices=PipelineDataFormat.SUPPORTED_FORMATS, 70 | help="Input format to read from", 71 | ) 72 | run_parser.add_argument( 73 | "--device", 74 | type=int, 75 | default=-1, 76 | help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)", 77 | ) 78 | run_parser.add_argument("--overwrite", action="store_true", help="Allow overwriting the output file.") 79 | run_parser.set_defaults(func=run_command_factory) 80 | 81 | def run(self): 82 | nlp, outputs = self._nlp, [] 83 | 84 | for entry in self._reader: 85 | output = nlp(**entry) if self._reader.is_multi_columns else nlp(entry) 86 | if isinstance(output, dict): 87 | outputs.append(output) 88 | else: 89 | outputs += output 90 | 91 | # Saving data 92 | if self._nlp.binary_output: 93 | binary_path = self._reader.save_binary(outputs) 94 | logger.warning("Current pipeline requires output to be in binary format, saving at {}".format(binary_path)) 95 | else: 96 | self._reader.save(outputs) 97 | -------------------------------------------------------------------------------- /tests/test_tokenization_t5.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Google T5 Authors and HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | import os
18 | import unittest
19 | 
20 | from transformers.tokenization_t5 import T5Tokenizer
21 | from transformers.tokenization_xlnet import SPIECE_UNDERLINE
22 | 
23 | from .test_tokenization_common import TokenizerTesterMixin
24 | 
25 | 
26 | SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
27 | 
28 | 
29 | class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
30 | 
31 |     tokenizer_class = T5Tokenizer
32 | 
33 |     def setUp(self):
34 |         super(T5TokenizationTest, self).setUp()
35 | 
36 |         # We have a SentencePiece fixture for testing
37 |         tokenizer = T5Tokenizer(SAMPLE_VOCAB)
38 |         tokenizer.save_pretrained(self.tmpdirname)
39 | 
40 |     def get_tokenizer(self, **kwargs):
41 |         return T5Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
42 | 
43 |     def get_input_output_texts(self):
44 |         input_text = "This is a test"
45 |         output_text = "This is a test"
46 |         return input_text, output_text
47 | 
48 |     def test_full_tokenizer(self):
49 |         tokenizer = T5Tokenizer(SAMPLE_VOCAB)
50 | 
51 |         tokens = tokenizer.tokenize("This is a test")
52 |         self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
53 | 
54 |         self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
55 | 
56 |         tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
57 |         self.assertListEqual(
58 |             tokens,
59 |             [
60 |                 SPIECE_UNDERLINE + "I",
61 |                 SPIECE_UNDERLINE + "was",
62 |                 SPIECE_UNDERLINE + "b",
63 |                 "or",
64 |                 "n",
65 |                 SPIECE_UNDERLINE + "in",
66 |                 SPIECE_UNDERLINE + "",
67 |                 "9",
68 |                 "2",
69 |                 "0",
70 |                 "0",
71 |                 "0",
72 |                 ",",
73 |                 SPIECE_UNDERLINE + "and",
74 |                 SPIECE_UNDERLINE + "this",
75 |                 SPIECE_UNDERLINE + "is",
76 |                 SPIECE_UNDERLINE + "f",
77 |                 "al",
78 |                 "s",
79 |                 "é",
80 |                 ".",
81 |             ],
82 |         )
83 |         ids = tokenizer.convert_tokens_to_ids(tokens)
84 |         self.assertListEqual(ids, [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4])
85 | 
86 |         back_tokens = tokenizer.convert_ids_to_tokens(ids)
87 |         self.assertListEqual(
88 |             back_tokens,
89 |             [
90 |                 SPIECE_UNDERLINE + "I",
91 |                 SPIECE_UNDERLINE + "was",
92 |                 SPIECE_UNDERLINE + "b",
93 |                 "or",
94 |                 "n",
95 |                 SPIECE_UNDERLINE + "in",
96 |                 SPIECE_UNDERLINE + "",
97 |                 "<unk>",
98 |                 "2",
99 |                 "0",
100 |                 "0",
101 |                 "0",
102 |                 ",",
103 |                 SPIECE_UNDERLINE + "and",
104 |                 SPIECE_UNDERLINE + "this",
105 |                 SPIECE_UNDERLINE + "is",
106 |                 SPIECE_UNDERLINE + "f",
107 |                 "al",
108 |                 "s",
109 |                 "<unk>",
110 |                 ".",
111 |             ],
112 |         )
113 | 
-------------------------------------------------------------------------------- /src/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert XLNet checkpoint."""
16 | 
17 | 
18 | import argparse
19 | import logging
20 | import os
21 | 
22 | import torch
23 | 
24 | from transformers import (
25 |     CONFIG_NAME,
26 |     WEIGHTS_NAME,
27 |     XLNetConfig,
28 |     XLNetForQuestionAnswering,
29 |     XLNetForSequenceClassification,
30 |     XLNetLMHeadModel,
31 |     load_tf_weights_in_xlnet,
32 | )
33 | 
34 | 
35 | GLUE_TASKS_NUM_LABELS = {
36 |     "cola": 2,
37 |     "mnli": 3,
38 |     "mrpc": 2,
39 |     "sst-2": 2,
40 |     "sts-b": 1,
41 |     "qqp": 2,
42 |     "qnli": 2,
43 |     "rte": 2,
44 |     "wnli": 2,
45 | }
46 | 
47 | 
48 | logging.basicConfig(level=logging.INFO)
49 | 
50 | 
51 | def convert_xlnet_checkpoint_to_pytorch(
52 |     tf_checkpoint_path, xlnet_config_file, pytorch_dump_folder_path, finetuning_task=None
53 | ):
54 |     # Initialise PyTorch model
55 |     config = XLNetConfig.from_json_file(xlnet_config_file)
56 | 
57 |     finetuning_task = finetuning_task.lower() if finetuning_task is not None else ""
58 |     if finetuning_task in GLUE_TASKS_NUM_LABELS:
59 |         print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config)))
60 |         config.finetuning_task = finetuning_task
61 |         config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task]
62 |         model = XLNetForSequenceClassification(config)
63 |     elif "squad" in finetuning_task:
64 |         config.finetuning_task = finetuning_task
65 |         model = XLNetForQuestionAnswering(config)
66 |     else:
67 |         model = XLNetLMHeadModel(config)
68 | 
69 |     # Load weights from tf checkpoint
70 |     load_tf_weights_in_xlnet(model, config, tf_checkpoint_path)
71 | 
72 |     # Save pytorch-model
73 |     pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
74 |     pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
75 |     print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path)))
76 |     torch.save(model.state_dict(), pytorch_weights_dump_path)
77 |     print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path)))
78 |     with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
79 |         f.write(config.to_json_string())
80 | 
81 | 
82 | if __name__ == "__main__":
83 |     parser = argparse.ArgumentParser()
84 |     # Required parameters
85 |     parser.add_argument(
86 |         "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint."
87 |     )
88 |     parser.add_argument(
89 |         "--xlnet_config_file",
90 |         default=None,
91 |         type=str,
92 |         required=True,
93 |         help="The config json file corresponding to the pre-trained XLNet model. \n"
\n" 94 | "This specifies the model architecture.", 95 | ) 96 | parser.add_argument( 97 | "--pytorch_dump_folder_path", 98 | default=None, 99 | type=str, 100 | required=True, 101 | help="Path to the folder to store the PyTorch model or dataset/vocab.", 102 | ) 103 | parser.add_argument( 104 | "--finetuning_task", 105 | default=None, 106 | type=str, 107 | help="Name of a task on which the XLNet TensorFloaw model was fine-tuned", 108 | ) 109 | args = parser.parse_args() 110 | print(args) 111 | 112 | convert_xlnet_checkpoint_to_pytorch( 113 | args.tf_checkpoint_path, args.xlnet_config_file, args.pytorch_dump_folder_path, args.finetuning_task 114 | ) 115 | -------------------------------------------------------------------------------- /tests/test_modeling_tf_auto.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import logging 18 | import unittest 19 | 20 | from transformers import is_tf_available 21 | 22 | from .utils import SMALL_MODEL_IDENTIFIER, require_tf, slow 23 | 24 | 25 | if is_tf_available(): 26 | from transformers import ( 27 | AutoConfig, 28 | BertConfig, 29 | TFAutoModel, 30 | TFBertModel, 31 | TFAutoModelWithLMHead, 32 | TFBertForMaskedLM, 33 | TFAutoModelForSequenceClassification, 34 | TFBertForSequenceClassification, 35 | TFAutoModelForQuestionAnswering, 36 | TFBertForQuestionAnswering, 37 | ) 38 | 39 | 40 | @require_tf 41 | class TFAutoModelTest(unittest.TestCase): 42 | @slow 43 | def test_model_from_pretrained(self): 44 | import h5py 45 | 46 | self.assertTrue(h5py.version.hdf5_version.startswith("1.10")) 47 | 48 | logging.basicConfig(level=logging.INFO) 49 | # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 50 | for model_name in ["bert-base-uncased"]: 51 | config = AutoConfig.from_pretrained(model_name) 52 | self.assertIsNotNone(config) 53 | self.assertIsInstance(config, BertConfig) 54 | 55 | model = TFAutoModel.from_pretrained(model_name) 56 | self.assertIsNotNone(model) 57 | self.assertIsInstance(model, TFBertModel) 58 | 59 | @slow 60 | def test_lmhead_model_from_pretrained(self): 61 | logging.basicConfig(level=logging.INFO) 62 | # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 63 | for model_name in ["bert-base-uncased"]: 64 | config = AutoConfig.from_pretrained(model_name) 65 | self.assertIsNotNone(config) 66 | self.assertIsInstance(config, BertConfig) 67 | 68 | model = TFAutoModelWithLMHead.from_pretrained(model_name) 69 | self.assertIsNotNone(model) 70 | self.assertIsInstance(model, TFBertForMaskedLM) 71 | 72 | @slow 73 | def test_sequence_classification_model_from_pretrained(self): 74 | logging.basicConfig(level=logging.INFO) 75 | # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 76 | for model_name in ["bert-base-uncased"]: 77 | config = AutoConfig.from_pretrained(model_name) 78 | 
self.assertIsNotNone(config) 79 | self.assertIsInstance(config, BertConfig) 80 | 81 | model = TFAutoModelForSequenceClassification.from_pretrained(model_name) 82 | self.assertIsNotNone(model) 83 | self.assertIsInstance(model, TFBertForSequenceClassification) 84 | 85 | @slow 86 | def test_question_answering_model_from_pretrained(self): 87 | logging.basicConfig(level=logging.INFO) 88 | # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 89 | for model_name in ["bert-base-uncased"]: 90 | config = AutoConfig.from_pretrained(model_name) 91 | self.assertIsNotNone(config) 92 | self.assertIsInstance(config, BertConfig) 93 | 94 | model = TFAutoModelForQuestionAnswering.from_pretrained(model_name) 95 | self.assertIsNotNone(model) 96 | self.assertIsInstance(model, TFBertForQuestionAnswering) 97 | 98 | def test_from_pretrained_identifier(self): 99 | logging.basicConfig(level=logging.INFO) 100 | model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER) 101 | self.assertIsInstance(model, TFBertForMaskedLM) 102 | -------------------------------------------------------------------------------- /tests/fixtures/sample_text.txt: -------------------------------------------------------------------------------- 1 | This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত 2 | Text should be one-sentence-per-line, with empty lines between documents. 3 | This sample text is public domain and was randomly selected from Project Guttenberg. 4 | 5 | The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors. 6 | Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity. 7 | Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them. 8 | "Cass" Beard had risen early that morning, but not with a view to discovery. 9 | A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets. 10 | The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency. 11 | This was nearly opposite. 12 | Mr. Cassius crossed the highway, and stopped suddenly. 13 | Something glittered in the nearest red pool before him. 14 | Gold, surely! 15 | But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring. 16 | Looking at it more attentively, he saw that it bore the inscription, "May to Cass." 17 | Like most of his fellow gold-seekers, Cass was superstitious. 18 | 19 | The fountain of classic wisdom, Hypatia herself. 20 | As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge. 21 | From my youth I felt in me a soul above the matter-entangled herd. 
22 | She revealed to me the glorious fact, that I am a spark of Divinity itself. 23 | A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's. 24 | There is a philosophic pleasure in opening one's treasures to the modest young. 25 | Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street. 26 | Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide; 27 | but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind. 28 | Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now. 29 | His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert; 30 | while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts. 31 | At last they reached the quay at the opposite end of the street; 32 | and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers. 33 | He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him. 34 | -------------------------------------------------------------------------------- /tests/test_modeling_auto.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | import logging 18 | import unittest 19 | 20 | from transformers import is_torch_available 21 | 22 | from .utils import SMALL_MODEL_IDENTIFIER, require_torch, slow 23 | 24 | 25 | if is_torch_available(): 26 | from transformers import ( 27 | AutoConfig, 28 | BertConfig, 29 | AutoModel, 30 | BertModel, 31 | AutoModelWithLMHead, 32 | BertForMaskedLM, 33 | AutoModelForSequenceClassification, 34 | BertForSequenceClassification, 35 | AutoModelForQuestionAnswering, 36 | BertForQuestionAnswering, 37 | ) 38 | from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP 39 | 40 | 41 | @require_torch 42 | class AutoModelTest(unittest.TestCase): 43 | @slow 44 | def test_model_from_pretrained(self): 45 | logging.basicConfig(level=logging.INFO) 46 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 47 | config = AutoConfig.from_pretrained(model_name) 48 | self.assertIsNotNone(config) 49 | self.assertIsInstance(config, BertConfig) 50 | 51 | model = AutoModel.from_pretrained(model_name) 52 | model, loading_info = AutoModel.from_pretrained(model_name, output_loading_info=True) 53 | self.assertIsNotNone(model) 54 | self.assertIsInstance(model, BertModel) 55 | for value in loading_info.values(): 56 | self.assertEqual(len(value), 0) 57 | 58 | @slow 59 | def test_lmhead_model_from_pretrained(self): 60 | logging.basicConfig(level=logging.INFO) 61 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 62 | config = AutoConfig.from_pretrained(model_name) 63 | self.assertIsNotNone(config) 64 | self.assertIsInstance(config, BertConfig) 65 | 66 | model = AutoModelWithLMHead.from_pretrained(model_name) 67 | model, loading_info = AutoModelWithLMHead.from_pretrained(model_name, output_loading_info=True) 68 | self.assertIsNotNone(model) 69 | self.assertIsInstance(model, BertForMaskedLM) 70 | 71 | @slow 72 | def test_sequence_classification_model_from_pretrained(self): 73 | logging.basicConfig(level=logging.INFO) 74 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 75 | config = AutoConfig.from_pretrained(model_name) 76 | self.assertIsNotNone(config) 77 | self.assertIsInstance(config, BertConfig) 78 | 79 | model = AutoModelForSequenceClassification.from_pretrained(model_name) 80 | model, loading_info = AutoModelForSequenceClassification.from_pretrained( 81 | model_name, output_loading_info=True 82 | ) 83 | self.assertIsNotNone(model) 84 | self.assertIsInstance(model, BertForSequenceClassification) 85 | 86 | @slow 87 | def test_question_answering_model_from_pretrained(self): 88 | logging.basicConfig(level=logging.INFO) 89 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 90 | config = AutoConfig.from_pretrained(model_name) 91 | self.assertIsNotNone(config) 92 | self.assertIsInstance(config, BertConfig) 93 | 94 | model = AutoModelForQuestionAnswering.from_pretrained(model_name) 95 | model, loading_info = AutoModelForQuestionAnswering.from_pretrained(model_name, output_loading_info=True) 96 | self.assertIsNotNone(model) 97 | self.assertIsInstance(model, BertForQuestionAnswering) 98 | 99 | def test_from_pretrained_identifier(self): 100 | logging.basicConfig(level=logging.INFO) 101 | model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER) 102 | self.assertIsInstance(model, BertForMaskedLM) 103 | -------------------------------------------------------------------------------- /docs/source/multilingual.rst: -------------------------------------------------------------------------------- 1 | Multi-lingual models 
2 | ================================================
3 | 
4 | Most of the models available in this library are mono-lingual models (English, Chinese and German). A few
5 | multi-lingual models are available and have different mechanisms than mono-lingual models.
6 | This page details the usage of these models.
7 | 
8 | The two models that currently support multiple languages are BERT and XLM.
9 | 
10 | XLM
11 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
12 | 
13 | XLM has a total of 10 different checkpoints, only one of which is mono-lingual. The 9 remaining model checkpoints can
14 | be split into two categories: the checkpoints that make use of language embeddings, and those that don't.
15 | 
16 | XLM & Language Embeddings
17 | ------------------------------------------------
18 | 
19 | This section concerns the following checkpoints:
20 | 
21 | - ``xlm-mlm-ende-1024`` (Masked language modeling, English-German)
22 | - ``xlm-mlm-enfr-1024`` (Masked language modeling, English-French)
23 | - ``xlm-mlm-enro-1024`` (Masked language modeling, English-Romanian)
24 | - ``xlm-mlm-xnli15-1024`` (Masked language modeling, XNLI languages)
25 | - ``xlm-mlm-tlm-xnli15-1024`` (Masked language modeling + Translation, XNLI languages)
26 | - ``xlm-clm-enfr-1024`` (Causal language modeling, English-French)
27 | - ``xlm-clm-ende-1024`` (Causal language modeling, English-German)
28 | 
29 | These checkpoints require language embeddings that will specify the language used at inference time. These language
30 | embeddings are represented as a tensor that is of the same shape as the input ids passed to the model. The values in
31 | these tensors depend on the language used and are identifiable using the ``lang2id`` and ``id2lang`` attributes
32 | from the tokenizer.
33 | 
34 | Here is an example using the ``xlm-clm-enfr-1024`` checkpoint (Causal language modeling, English-French):
35 | 
36 | 
37 | .. code-block::
38 | 
39 |     import torch
40 |     from transformers import XLMTokenizer, XLMWithLMHeadModel
41 | 
42 |     tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
43 |     model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")
44 | 
45 | The different languages this model/tokenizer handles, as well as the ids of these languages, are visible using the
46 | ``lang2id`` attribute:
47 | 
48 | .. code-block::
49 | 
50 |     print(tokenizer.lang2id)  # {'en': 0, 'fr': 1}
51 | 
52 | 
53 | These ids should be used when passing a language parameter during a model pass. Let's define our inputs:
54 | 
55 | .. code-block::
56 | 
57 |     input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")])  # batch size of 1
58 | 
59 | 
60 | We should now define the language embedding by using the previously defined language id. We want to create a tensor
61 | filled with the appropriate language ids, of the same size as input_ids. For English, the id is 0:
62 | 
63 | .. code-block::
64 | 
65 |     language_id = tokenizer.lang2id['en']  # 0
66 |     langs = torch.tensor([language_id] * input_ids.shape[1])  # torch.tensor([0, 0, 0, ..., 0])
67 | 
68 |     # We reshape it to be of size (batch_size, sequence_length)
69 |     langs = langs.view(1, -1)  # is now of shape [1, sequence_length] (we have a batch size of 1)
70 | 
71 | 
72 | You can then feed it all as input to your model:
73 | 
74 | .. code-block::
75 | 
76 |     outputs = model(input_ids, langs=langs)
77 | 
78 | 
79 | The example `run_generation.py `__
80 | can generate text using the CLM checkpoints from XLM, using the language embeddings.
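For convenience, the snippets above can be combined into a single runnable sketch that greedily picks the next token (it assumes the ``xlm-clm-enfr-1024`` checkpoint can be downloaded):

.. code-block::

    import torch
    from transformers import XLMTokenizer, XLMWithLMHeadModel

    tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
    model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")

    input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")])  # batch size of 1
    langs = torch.tensor([tokenizer.lang2id["en"]] * input_ids.shape[1]).view(1, -1)

    with torch.no_grad():
        logits = model(input_ids, langs=langs)[0]  # (1, sequence_length, vocab_size)
    next_token_id = logits[0, -1].argmax().item()
    print(tokenizer.decode([next_token_id]))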
81 | 
82 | XLM without Language Embeddings
83 | ------------------------------------------------
84 | 
85 | This section concerns the following checkpoints:
86 | 
87 | - ``xlm-mlm-17-1280`` (Masked language modeling, 17 languages)
88 | - ``xlm-mlm-100-1280`` (Masked language modeling, 100 languages)
89 | 
90 | These checkpoints do not require language embeddings at inference time. Unlike the previously-mentioned XLM
91 | checkpoints, they are used to obtain generic sentence representations.
92 | 
93 | 
94 | BERT
95 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
96 | 
97 | BERT has two checkpoints that can be used for multi-lingual tasks:
98 | 
99 | - ``bert-base-multilingual-uncased`` (Masked language modeling + Next sentence prediction, 102 languages)
100 | - ``bert-base-multilingual-cased`` (Masked language modeling + Next sentence prediction, 104 languages)
101 | 
102 | These checkpoints do not require language embeddings at inference time. They should identify the language
103 | used in the context and infer accordingly.
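As a minimal sketch of using one of these checkpoints (checkpoint download assumed); note that no ``langs`` tensor is involved:

.. code-block::

    import torch
    from transformers import BertTokenizer, BertModel

    tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
    model = BertModel.from_pretrained("bert-base-multilingual-cased")

    # French input, same call signature as for an English-only checkpoint
    input_ids = torch.tensor([tokenizer.encode("Paris est la capitale de la France.")])
    sequence_output = model(input_ids)[0]  # (1, sequence_length, hidden_size)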
--------------------------------------------------------------------------------
/src/transformers/configuration_t5.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2010, The T5 Authors and HuggingFace Inc.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """ T5 model configuration """
16 | 
17 | 
18 | import logging
19 | 
20 | from .configuration_utils import PretrainedConfig
21 | 
22 | 
23 | logger = logging.getLogger(__name__)
24 | 
25 | T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
26 |     "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json",
27 |     "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json",
28 |     "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json",
29 |     "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json",
30 |     "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json",
31 | }
32 | 
33 | 
34 | class T5Config(PretrainedConfig):
35 |     r"""
36 |         :class:`~transformers.T5Config` is the configuration class to store the configuration of a
37 |         `T5Model`.
38 | 
39 | 
40 |         Arguments:
41 |             vocab_size: Vocabulary size of `inputs_ids` in `T5Model`.
42 |             n_positions: The maximum sequence length that this model might
43 |                 ever be used with.
44 |             d_model: Size of the encoder layers and the pooler layer.
45 |             d_kv: Size of the key, query and value projections per attention
46 |                 head.
47 |             d_ff: Size of the intermediate (i.e., feed-forward) layer in each
48 |                 Transformer block.
49 |             num_layers: Number of hidden layers in the Transformer encoder.
50 |             num_heads: Number of attention heads for each attention layer in
51 |                 the Transformer encoder.
52 |             relative_attention_num_buckets: The number of buckets to use when
53 |                 computing the relative position biases.
54 |             dropout_rate: The dropout probability for all fully connected
55 |                 layers in the embeddings, encoder, and pooler, and for the
56 |                 attention probabilities.
57 |             layer_norm_epsilon: The epsilon used by the layer normalization
58 |                 layers.
59 |             initializer_factor: A factor for initializing all weight matrices
60 |                 (should be kept to 1.0, used for initialization testing).
61 |     """
62 |     pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP
63 | 
64 |     def __init__(
65 |         self,
66 |         vocab_size=32128,
67 |         n_positions=512,
68 |         d_model=512,
69 |         d_kv=64,
70 |         d_ff=2048,
71 |         num_layers=6,
72 |         num_heads=8,
73 |         relative_attention_num_buckets=32,
74 |         dropout_rate=0.1,
75 |         layer_norm_epsilon=1e-6,
76 |         initializer_factor=1.0,
77 |         **kwargs
78 |     ):
79 |         super(T5Config, self).__init__(**kwargs)
80 |         self.vocab_size = vocab_size
81 |         self.n_positions = n_positions
82 |         self.d_model = d_model
83 |         self.d_kv = d_kv
84 |         self.d_ff = d_ff
85 |         self.num_layers = num_layers
86 |         self.num_heads = num_heads
87 |         self.relative_attention_num_buckets = relative_attention_num_buckets
88 |         self.dropout_rate = dropout_rate
89 |         self.layer_norm_epsilon = layer_norm_epsilon
90 |         self.initializer_factor = initializer_factor
91 | 
92 |     @property
93 |     def max_position_embeddings(self):
94 |         return self.n_positions
95 | 
96 |     @property
97 |     def hidden_size(self):
98 |         return self.d_model
99 | 
100 |     @property
101 |     def num_attention_heads(self):
102 |         return self.num_heads
103 | 
104 |     @property
105 |     def num_hidden_layers(self):
106 |         return self.num_layers
107 | 
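The properties at the end of the class expose the T5-specific hyper-parameter names under the generic BERT-style names used elsewhere in the library; a small sketch of the mapping, using the defaults above (assuming the package is installed):

.. code-block::

    from transformers import T5Config

    config = T5Config()  # library defaults
    assert config.hidden_size == config.d_model == 512
    assert config.num_hidden_layers == config.num_layers == 6
    assert config.num_attention_heads == config.num_heads == 8
    assert config.max_position_embeddings == config.n_positions == 512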
--------------------------------------------------------------------------------
/src/transformers/configuration_openai.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ OpenAI GPT configuration """
17 | 
18 | 
19 | import logging
20 | 
21 | from .configuration_utils import PretrainedConfig
22 | 
23 | 
24 | logger = logging.getLogger(__name__)
25 | 
26 | OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
27 |     "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"
28 | }
29 | 
30 | 
31 | class OpenAIGPTConfig(PretrainedConfig):
32 |     """
33 |     Configuration class to store the configuration of a `OpenAIGPTModel`.
34 | 
35 |     Args:
36 |         vocab_size: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
37 |         n_positions: Number of positional embeddings.
38 |         n_ctx: Size of the causal mask (usually same as n_positions).
39 |         n_embd: Dimensionality of the embeddings and hidden states.
40 |         n_layer: Number of hidden layers in the Transformer encoder.
41 |         n_head: Number of attention heads for each attention layer in
42 |             the Transformer encoder.
43 |         afn: The non-linear activation function (function or string) in the
44 |             encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
45 |         resid_pdrop: The dropout probability for all fully connected
46 |             layers in the embeddings, encoder, and pooler.
47 |         attn_pdrop: The dropout ratio for the attention
48 |             probabilities.
49 |         embd_pdrop: The dropout ratio for the embeddings.
50 |         layer_norm_epsilon: epsilon to use in the layer norm layers
51 |         initializer_range: The standard deviation of the truncated_normal_initializer for
52 |             initializing all weight matrices.
53 |         predict_special_tokens: Whether to predict special tokens (when the model has an LM head).
54 |     """
55 | 
56 |     pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
57 | 
58 |     def __init__(
59 |         self,
60 |         vocab_size=40478,
61 |         n_positions=512,
62 |         n_ctx=512,
63 |         n_embd=768,
64 |         n_layer=12,
65 |         n_head=12,
66 |         afn="gelu",
67 |         resid_pdrop=0.1,
68 |         embd_pdrop=0.1,
69 |         attn_pdrop=0.1,
70 |         layer_norm_epsilon=1e-5,
71 |         initializer_range=0.02,
72 |         predict_special_tokens=True,
73 |         summary_type="cls_index",
74 |         summary_use_proj=True,
75 |         summary_activation=None,
76 |         summary_proj_to_labels=True,
77 |         summary_first_dropout=0.1,
78 |         **kwargs
79 |     ):
80 |         """Constructs OpenAIGPTConfig.
81 |         """
82 |         super(OpenAIGPTConfig, self).__init__(**kwargs)
83 |         self.vocab_size = vocab_size
84 |         self.n_ctx = n_ctx
85 |         self.n_positions = n_positions
86 |         self.n_embd = n_embd
87 |         self.n_layer = n_layer
88 |         self.n_head = n_head
89 |         self.afn = afn
90 |         self.resid_pdrop = resid_pdrop
91 |         self.embd_pdrop = embd_pdrop
92 |         self.attn_pdrop = attn_pdrop
93 |         self.layer_norm_epsilon = layer_norm_epsilon
94 |         self.initializer_range = initializer_range
95 |         self.predict_special_tokens = predict_special_tokens
96 |         self.summary_type = summary_type
97 |         self.summary_use_proj = summary_use_proj
98 |         self.summary_activation = summary_activation
99 |         self.summary_first_dropout = summary_first_dropout
100 |         self.summary_proj_to_labels = summary_proj_to_labels
101 | 
102 |     @property
103 |     def max_position_embeddings(self):
104 |         return self.n_positions
105 | 
106 |     @property
107 |     def hidden_size(self):
108 |         return self.n_embd
109 | 
110 |     @property
111 |     def num_attention_heads(self):
112 |         return self.n_head
113 | 
114 |     @property
115 |     def num_hidden_layers(self):
116 |         return self.n_layer
117 | 
--------------------------------------------------------------------------------
/src/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 16 | """Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" 17 | 18 | import argparse 19 | import os 20 | 21 | import numpy as np 22 | import tensorflow as tf 23 | import torch 24 | 25 | from transformers import BertModel 26 | 27 | 28 | def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str): 29 | 30 | """ 31 | :param model:BertModel Pytorch model instance to be converted 32 | :param ckpt_dir: Tensorflow model directory 33 | :param model_name: model name 34 | :return: 35 | 36 | Currently supported HF models: 37 | Y BertModel 38 | N BertForMaskedLM 39 | N BertForPreTraining 40 | N BertForMultipleChoice 41 | N BertForNextSentencePrediction 42 | N BertForSequenceClassification 43 | N BertForQuestionAnswering 44 | """ 45 | 46 | tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value") 47 | 48 | var_map = ( 49 | ("layer.", "layer_"), 50 | ("word_embeddings.weight", "word_embeddings"), 51 | ("position_embeddings.weight", "position_embeddings"), 52 | ("token_type_embeddings.weight", "token_type_embeddings"), 53 | (".", "/"), 54 | ("LayerNorm/weight", "LayerNorm/gamma"), 55 | ("LayerNorm/bias", "LayerNorm/beta"), 56 | ("weight", "kernel"), 57 | ) 58 | 59 | if not os.path.isdir(ckpt_dir): 60 | os.makedirs(ckpt_dir) 61 | 62 | state_dict = model.state_dict() 63 | 64 | def to_tf_var_name(name: str): 65 | for patt, repl in iter(var_map): 66 | name = name.replace(patt, repl) 67 | return "bert/{}".format(name) 68 | 69 | def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session): 70 | tf_dtype = tf.dtypes.as_dtype(tensor.dtype) 71 | tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) 72 | session.run(tf.variables_initializer([tf_var])) 73 | session.run(tf_var) 74 | return tf_var 75 | 76 | tf.reset_default_graph() 77 | with tf.Session() as session: 78 | for var_name in state_dict: 79 | tf_name = to_tf_var_name(var_name) 80 | torch_tensor = state_dict[var_name].numpy() 81 | if any([x in var_name for x in tensors_to_transpose]): 82 | torch_tensor = torch_tensor.T 83 | tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) 84 | tf.keras.backend.set_value(tf_var, torch_tensor) 85 | tf_weight = session.run(tf_var) 86 | print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor))) 87 | 88 | saver = tf.train.Saver(tf.trainable_variables()) 89 | saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) 90 | 91 | 92 | def main(raw_args=None): 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument("--model_name", type=str, required=True, help="model name e.g. 
bert-base-uncased") 95 | parser.add_argument( 96 | "--cache_dir", type=str, default=None, required=False, help="Directory containing pytorch model" 97 | ) 98 | parser.add_argument("--pytorch_model_path", type=str, required=True, help="/path/to/.bin") 99 | parser.add_argument("--tf_cache_dir", type=str, required=True, help="Directory in which to save tensorflow model") 100 | args = parser.parse_args(raw_args) 101 | 102 | model = BertModel.from_pretrained( 103 | pretrained_model_name_or_path=args.model_name, 104 | state_dict=torch.load(args.pytorch_model_path), 105 | cache_dir=args.cache_dir, 106 | ) 107 | 108 | convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_cache_dir, model_name=args.model_name) 109 | 110 | 111 | if __name__ == "__main__": 112 | main() 113 | -------------------------------------------------------------------------------- /tests/test_tokenization_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import json 18 | import os 19 | import unittest 20 | 21 | from transformers.tokenization_roberta import VOCAB_FILES_NAMES, RobertaTokenizer 22 | 23 | from .test_tokenization_common import TokenizerTesterMixin 24 | from .utils import slow 25 | 26 | 27 | class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): 28 | tokenizer_class = RobertaTokenizer 29 | 30 | def setUp(self): 31 | super(RobertaTokenizationTest, self).setUp() 32 | 33 | # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt
34 |         vocab = [
35 |             "l",
36 |             "o",
37 |             "w",
38 |             "e",
39 |             "r",
40 |             "s",
41 |             "t",
42 |             "i",
43 |             "d",
44 |             "n",
45 |             "\u0120",
46 |             "\u0120l",
47 |             "\u0120n",
48 |             "\u0120lo",
49 |             "\u0120low",
50 |             "er",
51 |             "\u0120lowest",
52 |             "\u0120newer",
53 |             "\u0120wider",
54 |             "<unk>",
55 |         ]
56 |         vocab_tokens = dict(zip(vocab, range(len(vocab))))
57 |         merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
58 |         self.special_tokens_map = {"unk_token": "<unk>"}
59 | 
60 |         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
61 |         self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
62 |         with open(self.vocab_file, "w", encoding="utf-8") as fp:
63 |             fp.write(json.dumps(vocab_tokens) + "\n")
64 |         with open(self.merges_file, "w", encoding="utf-8") as fp:
65 |             fp.write("\n".join(merges))
66 | 
67 |     def get_tokenizer(self, **kwargs):
68 |         kwargs.update(self.special_tokens_map)
69 |         return RobertaTokenizer.from_pretrained(self.tmpdirname, **kwargs)
70 | 
71 |     def get_input_output_texts(self):
72 |         input_text = "lower newer"
73 |         output_text = "lower newer"
74 |         return input_text, output_text
75 | 
76 |     def test_full_tokenizer(self):
77 |         tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
78 |         text = "lower newer"
79 |         bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
80 |         tokens = tokenizer.tokenize(text, add_prefix_space=True)
81 |         self.assertListEqual(tokens, bpe_tokens)
82 | 
83 |         input_tokens = tokens + [tokenizer.unk_token]
84 |         input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
85 |         self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
86 | 
87 |     def roberta_dict_integration_testing(self):
88 |         tokenizer = self.get_tokenizer()
89 | 
90 |         self.assertListEqual(tokenizer.encode("Hello world!", add_special_tokens=False), [0, 31414, 232, 328, 2])
91 |         self.assertListEqual(
92 |             tokenizer.encode("Hello world! cécé herlolip 418", add_special_tokens=False),
93 |             [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2],
94 |         )
95 | 
96 |     @slow
97 |     def test_sequence_builders(self):
98 |         tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
99 | 
100 |         text = tokenizer.encode("sequence builders", add_special_tokens=False)
101 |         text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
102 | 
103 |         encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
104 |         encoded_pair_from_decode = tokenizer.encode(
105 |             "sequence builders", "multi-sequence build", add_special_tokens=True
106 |         )
107 | 
108 |         encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
109 |         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
110 | 
111 |         assert encoded_sentence == encoded_text_from_decode
112 |         assert encoded_pair == encoded_pair_from_decode
113 | 
--------------------------------------------------------------------------------
/templates/adding_a_new_model/configuration_xxx.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2010, XXX authors
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """ XXX model configuration """
16 | 
17 | 
18 | import logging
19 | 
20 | from .configuration_utils import PretrainedConfig
21 | 
22 | 
23 | logger = logging.getLogger(__name__)
24 | 
25 | XXX_PRETRAINED_CONFIG_ARCHIVE_MAP = {
26 |     "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-config.json",
27 |     "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-config.json",
28 | }
29 | 
30 | 
31 | class XxxConfig(PretrainedConfig):
32 |     r"""
33 |         :class:`~transformers.XxxConfig` is the configuration class to store the configuration of a
34 |         `XxxModel`.
35 | 
36 | 
37 |         Arguments:
38 |             vocab_size: Vocabulary size of `inputs_ids` in `XxxModel`.
39 |             hidden_size: Size of the encoder layers and the pooler layer.
40 |             num_hidden_layers: Number of hidden layers in the Transformer encoder.
41 |             num_attention_heads: Number of attention heads for each attention layer in
42 |                 the Transformer encoder.
43 |             intermediate_size: The size of the "intermediate" (i.e., feed-forward)
44 |                 layer in the Transformer encoder.
45 |             hidden_act: The non-linear activation function (function or string) in the
46 |                 encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
47 |             hidden_dropout_prob: The dropout probability for all fully connected
48 |                 layers in the embeddings, encoder, and pooler.
49 |             attention_probs_dropout_prob: The dropout ratio for the attention
50 |                 probabilities.
51 |             max_position_embeddings: The maximum sequence length that this model might
52 |                 ever be used with. Typically set this to something large just in case
53 |                 (e.g., 512 or 1024 or 2048).
54 |             type_vocab_size: The vocabulary size of the `token_type_ids` passed into
55 |                 `XxxModel`.
56 |             initializer_range: The standard deviation of the truncated_normal_initializer for
57 |                 initializing all weight matrices.
58 |             layer_norm_eps: The epsilon used by LayerNorm.
59 | """ 60 | pretrained_config_archive_map = XXX_PRETRAINED_CONFIG_ARCHIVE_MAP 61 | 62 | def __init__( 63 | self, 64 | vocab_size=50257, 65 | n_positions=1024, 66 | n_ctx=1024, 67 | n_embd=768, 68 | n_layer=12, 69 | n_head=12, 70 | resid_pdrop=0.1, 71 | embd_pdrop=0.1, 72 | attn_pdrop=0.1, 73 | layer_norm_epsilon=1e-5, 74 | initializer_range=0.02, 75 | summary_type="cls_index", 76 | summary_use_proj=True, 77 | summary_activation=None, 78 | summary_proj_to_labels=True, 79 | summary_first_dropout=0.1, 80 | **kwargs 81 | ): 82 | super(XxxConfig, self).__init__(**kwargs) 83 | self.vocab_size = vocab_size 84 | self.n_ctx = n_ctx 85 | self.n_positions = n_positions 86 | self.n_embd = n_embd 87 | self.n_layer = n_layer 88 | self.n_head = n_head 89 | self.resid_pdrop = resid_pdrop 90 | self.embd_pdrop = embd_pdrop 91 | self.attn_pdrop = attn_pdrop 92 | self.layer_norm_epsilon = layer_norm_epsilon 93 | self.initializer_range = initializer_range 94 | self.summary_type = summary_type 95 | self.summary_use_proj = summary_use_proj 96 | self.summary_activation = summary_activation 97 | self.summary_first_dropout = summary_first_dropout 98 | self.summary_proj_to_labels = summary_proj_to_labels 99 | 100 | @property 101 | def max_position_embeddings(self): 102 | return self.n_positions 103 | 104 | @property 105 | def hidden_size(self): 106 | return self.n_embd 107 | 108 | @property 109 | def num_attention_heads(self): 110 | return self.n_head 111 | 112 | @property 113 | def num_hidden_layers(self): 114 | return self.n_layer 115 | -------------------------------------------------------------------------------- /docs/source/converting_tensorflow_models.rst: -------------------------------------------------------------------------------- 1 | Converting Tensorflow Checkpoints 2 | ================================================ 3 | 4 | A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints in models than be loaded using the ``from_pretrained`` methods of the library. 5 | 6 | BERT 7 | ^^^^ 8 | 9 | You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google `_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py `_ script. 10 | 11 | This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py `_\ , `run_bert_classifier.py `_ and `run_bert_squad.py `_\ ). 12 | 13 | You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow checkpoint (the three files starting with ``bert_model.ckpt``\ ) but be sure to keep the configuration file (\ ``bert_config.json``\ ) and the vocabulary file (\ ``vocab.txt``\ ) as these are needed for the PyTorch model too. 14 | 15 | To run this specific conversion script you will need to have TensorFlow and PyTorch installed (\ ``pip install tensorflow``\ ). The rest of the repository only requires PyTorch. 16 | 17 | Here is an example of the conversion process for a pre-trained ``BERT-Base Uncased`` model: 18 | 19 | .. 
19 | .. code-block:: shell
20 | 
21 |    export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
22 | 
23 |    transformers bert \
24 |      $BERT_BASE_DIR/bert_model.ckpt \
25 |      $BERT_BASE_DIR/bert_config.json \
26 |      $BERT_BASE_DIR/pytorch_model.bin
27 | 
28 | You can download Google's pre-trained models for the conversion `here `__.
29 | 
30 | OpenAI GPT
31 | ^^^^^^^^^^
32 | 
33 | Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint is saved in the same format as the OpenAI pretrained model (see `here `__\ ):
34 | 
35 | .. code-block:: shell
36 | 
37 |    export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
38 | 
39 |    transformers gpt \
40 |      $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
41 |      $PYTORCH_DUMP_OUTPUT \
42 |      [OPENAI_GPT_CONFIG]
43 | 
44 | OpenAI GPT-2
45 | ^^^^^^^^^^^^
46 | 
47 | Here is an example of the conversion process for a pre-trained OpenAI GPT-2 model (see `here `__\ ):
48 | 
49 | .. code-block:: shell
50 | 
51 |    export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
52 | 
53 |    transformers gpt2 \
54 |      $OPENAI_GPT2_CHECKPOINT_PATH \
55 |      $PYTORCH_DUMP_OUTPUT \
56 |      [OPENAI_GPT2_CONFIG]
57 | 
58 | Transformer-XL
59 | ^^^^^^^^^^^^^^
60 | 
61 | Here is an example of the conversion process for a pre-trained Transformer-XL model (see `here `__\ ):
62 | 
63 | .. code-block:: shell
64 | 
65 |    export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
66 | 
67 |    transformers transfo_xl \
68 |      $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
69 |      $PYTORCH_DUMP_OUTPUT \
70 |      [TRANSFO_XL_CONFIG]
71 | 
72 | 
73 | XLNet
74 | ^^^^^
75 | 
76 | Here is an example of the conversion process for a pre-trained XLNet model, fine-tuned on STS-B using the TensorFlow script:
77 | 
78 | .. code-block:: shell
79 | 
80 |    export XLNET_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
81 |    export XLNET_CONFIG_PATH=/path/to/xlnet/config
82 | 
83 |    transformers xlnet \
84 |      $XLNET_CHECKPOINT_PATH \
85 |      $XLNET_CONFIG_PATH \
86 |      $PYTORCH_DUMP_OUTPUT \
87 |      STS-B \
88 | 
89 | 
90 | XLM
91 | ^^^
92 | 
93 | Here is an example of the conversion process for a pre-trained XLM model:
94 | 
95 | .. code-block:: shell
96 | 
97 |    export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
98 | 
99 |    transformers xlm \
100 |      $XLM_CHECKPOINT_PATH \
101 |      $PYTORCH_DUMP_OUTPUT \
102 | 
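Once converted, the resulting dump can usually be loaded back without the original TensorFlow files; a minimal sketch, assuming the output folder contains the generated ``pytorch_model.bin`` and ``config.json`` (the folder path is a placeholder):

.. code-block:: python

    from transformers import BertModel

    # from_pretrained accepts a local folder holding the converted weights and config
    model = BertModel.from_pretrained("/path/to/pytorch_dump_folder")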
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | """
2 | Simple check list from AllenNLP repo: https://github.com/allenai/allennlp/blob/master/setup.py
3 | 
4 | To create the package for pypi.
5 | 
6 | 1. Change the version in __init__.py, setup.py as well as docs/source/conf.py.
7 | 
8 | 2. Commit these changes with the message: "Release: VERSION"
9 | 
10 | 3. Add a tag in git to mark the release: "git tag VERSION -m'Adds tag VERSION for pypi' "
11 |    Push the tag to git: git push --tags origin master
12 | 
13 | 4. Build both the sources and the wheel. Do not change anything in setup.py between
14 |    creating the wheel and the source distribution (obviously).
15 | 
16 |    For the wheel, run: "python setup.py bdist_wheel" in the top level directory.
17 |    (this will build a wheel for the python version you use to build it).
18 | 
19 |    For the sources, run: "python setup.py sdist"
20 |    You should now have a /dist directory with both .whl and .tar.gz source versions.
21 | 
22 | 5. Check that everything looks correct by uploading the package to the pypi test server:
23 | 
24 |    twine upload dist/* -r pypitest
25 |    (pypi suggests using twine as other methods upload files via plaintext.)
26 | 
27 |    Check that you can install it in a virtualenv by running:
28 |    pip install -i https://testpypi.python.org/pypi transformers
29 | 
30 | 6. Upload the final version to actual pypi:
31 |    twine upload dist/* -r pypi
32 | 
33 | 7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.
34 | 
35 | """
36 | 
37 | import shutil
38 | from pathlib import Path
39 | 
40 | from setuptools import find_packages, setup
41 | 
42 | 
43 | # Remove stale transformers.egg-info directory to avoid https://github.com/pypa/pip/issues/5466
44 | stale_egg_info = Path(__file__).parent / "transformers.egg-info"
45 | if stale_egg_info.exists():
46 |     print(
47 |         (
48 |             "Warning: {} exists.\n\n"
49 |             "If you recently updated transformers to 3.0 or later, this is expected,\n"
50 |             "but it may prevent transformers from installing in editable mode.\n\n"
51 |             "This directory is automatically generated by Python's packaging tools.\n"
52 |             "I will remove it now.\n\n"
53 |             "See https://github.com/pypa/pip/issues/5466 for details.\n"
54 |         ).format(stale_egg_info)
55 |     )
56 |     shutil.rmtree(stale_egg_info)
57 | 
58 | 
59 | extras = {}
60 | 
61 | extras["mecab"] = ["mecab-python3"]
62 | extras["sklearn"] = ["scikit-learn"]
63 | extras["tf"] = ["tensorflow"]
64 | extras["torch"] = ["torch"]
65 | 
66 | extras["serving"] = ["pydantic", "uvicorn", "fastapi"]
67 | extras["all"] = extras["serving"] + ["tensorflow", "torch"]
68 | 
69 | extras["testing"] = ["pytest", "pytest-xdist"]
70 | extras["quality"] = ["black", "isort", "flake8"]
71 | extras["docs"] = ["recommonmark", "sphinx", "sphinx-markdown-tables", "sphinx-rtd-theme"]
72 | extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3", "scikit-learn", "tensorflow", "torch"]
73 | 
74 | setup(
75 |     name="transformers",
76 |     version="2.3.0",
77 |     author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
78 |     author_email="thomas@huggingface.co",
79 |     description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
80 |     long_description=open("README.md", "r", encoding="utf-8").read(),
81 |     long_description_content_type="text/markdown",
82 |     keywords="NLP deep learning transformer pytorch tensorflow BERT GPT GPT-2 google openai CMU",
83 |     license="Apache",
84 |     url="https://github.com/huggingface/transformers",
85 |     package_dir={"": "src"},
86 |     packages=find_packages("src"),
87 |     install_requires=[
88 |         "numpy",
89 |         # accessing files from S3 directly
90 |         "boto3",
91 |         # filesystem locks e.g. to prevent parallel downloads
92 |         "filelock",
93 |         # for downloading models over HTTPS
94 |         "requests",
95 |         # progress bars in model download and training scripts
96 |         "tqdm",
97 |         # for OpenAI GPT
98 |         "regex != 2019.12.17",
99 |         # for XLNet
100 |         "sentencepiece",
101 |         # for XLM
102 |         "sacremoses",
103 |     ],
104 |     extras_require=extras,
105 |     scripts=["transformers-cli"],
106 |     python_requires=">=3.5.0",
107 |     classifiers=[
108 |         "Development Status :: 5 - Production/Stable",
109 |         "Intended Audience :: Developers",
110 |         "Intended Audience :: Education",
111 |         "Intended Audience :: Science/Research",
112 |         "License :: OSI Approved :: Apache Software License",
113 |         "Operating System :: OS Independent",
114 |         "Programming Language :: Python :: 3",
115 |         "Programming Language :: Python :: 3.5",
116 |         "Programming Language :: Python :: 3.6",
117 |         "Programming Language :: Python :: 3.7",
118 |         "Topic :: Scientific/Engineering :: Artificial Intelligence",
119 |     ],
120 | )
121 | 
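The ``extras_require`` table above means optional dependency groups can be selected at install time; for example (group names as declared above, commands illustrative):

.. code-block:: shell

    # core library only
    pip install transformers

    # with optional extras, e.g. PyTorch support plus the test tooling
    pip install "transformers[torch,testing]"

    # editable install of a local clone with the full development stack
    pip install -e ".[dev]"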
48 | """ 49 | 50 | pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP 51 | 52 | def __init__( 53 | self, 54 | vocab_size=246534, 55 | n_positions=256, 56 | n_ctx=256, 57 | n_embd=1280, 58 | dff=8192, 59 | n_layer=48, 60 | n_head=16, 61 | resid_pdrop=0.1, 62 | embd_pdrop=0.1, 63 | attn_pdrop=0.1, 64 | layer_norm_epsilon=1e-6, 65 | initializer_range=0.02, 66 | summary_type="cls_index", 67 | summary_use_proj=True, 68 | summary_activation=None, 69 | summary_proj_to_labels=True, 70 | summary_first_dropout=0.1, 71 | **kwargs 72 | ): 73 | """Constructs CTRLConfig. 74 | 75 | Args: 76 | vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file. 77 | n_positions: Number of positional embeddings. 78 | n_ctx: Size of the causal mask (usually same as n_positions). 79 | dff: Size of the inner dimension of the FFN. 80 | n_embd: Dimensionality of the embeddings and hidden states. 81 | n_layer: Number of hidden layers in the Transformer encoder. 82 | n_head: Number of attention heads for each attention layer in 83 | the Transformer encoder. 84 | layer_norm_epsilon: epsilon to use in the layer norm layers 85 | resid_pdrop: The dropout probabilitiy for all fully connected 86 | layers in the embeddings, encoder, and pooler. 87 | attn_pdrop: The dropout ratio for the attention 88 | probabilities. 89 | embd_pdrop: The dropout ratio for the embeddings. 90 | initializer_range: The sttdev of the truncated_normal_initializer for 91 | initializing all weight matrices. 92 | """ 93 | super(CTRLConfig, self).__init__(**kwargs) 94 | self.vocab_size = vocab_size 95 | self.n_ctx = n_ctx 96 | self.n_positions = n_positions 97 | self.n_embd = n_embd 98 | self.n_layer = n_layer 99 | self.n_head = n_head 100 | self.dff = dff 101 | self.resid_pdrop = resid_pdrop 102 | self.embd_pdrop = embd_pdrop 103 | self.attn_pdrop = attn_pdrop 104 | self.layer_norm_epsilon = layer_norm_epsilon 105 | self.initializer_range = initializer_range 106 | 107 | self.summary_type = summary_type 108 | self.summary_use_proj = summary_use_proj 109 | self.summary_activation = summary_activation 110 | self.summary_first_dropout = summary_first_dropout 111 | self.summary_proj_to_labels = summary_proj_to_labels 112 | 113 | @property 114 | def max_position_embeddings(self): 115 | return self.n_positions 116 | 117 | @property 118 | def hidden_size(self): 119 | return self.n_embd 120 | 121 | @property 122 | def num_attention_heads(self): 123 | return self.n_head 124 | 125 | @property 126 | def num_hidden_layers(self): 127 | return self.n_layer 128 | --------------------------------------------------------------------------------