├── transformers
├── utils
│   ├── __init__.py
│   ├── dummy_flax_objects.py
│   ├── model_parallel_utils.py
│   └── dummy_sentencepiece_objects.py
├── benchmark
│   └── __init__.py
├── models
│   ├── dialogpt
│   │   ├── __init__.py
│   │   └── convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py
│   ├── xlm_prophetnet
│   │   ├── configuration_xlm_prophetnet.py
│   │   └── __init__.py
│   ├── camembert
│   │   └── configuration_camembert.py
│   ├── __init__.py
│   ├── phobert
│   │   └── __init__.py
│   ├── bertweet
│   │   └── __init__.py
│   ├── bert_japanese
│   │   └── __init__.py
│   ├── mmbt
│   │   ├── configuration_mmbt.py
│   │   └── __init__.py
│   ├── mobilebert
│   │   ├── tokenization_mobilebert.py
│   │   ├── tokenization_mobilebert_fast.py
│   │   └── convert_mobilebert_original_tf_checkpoint_to_pytorch.py
│   ├── herbert
│   │   ├── __init__.py
│   │   └── tokenization_herbert.py
│   ├── encoder_decoder
│   │   └── __init__.py
│   ├── bart
│   │   ├── tokenization_bart_fast.py
│   │   ├── tokenization_bart.py
│   │   └── __init__.py
│   ├── led
│   │   ├── tokenization_led.py
│   │   ├── tokenization_led_fast.py
│   │   └── __init__.py
│   ├── retribert
│   │   ├── tokenization_retribert.py
│   │   ├── tokenization_retribert_fast.py
│   │   └── __init__.py
│   ├── xlm_roberta
│   │   └── configuration_xlm_roberta.py
│   ├── barthez
│   │   └── __init__.py
│   ├── fsmt
│   │   └── __init__.py
│   ├── rag
│   │   ├── __init__.py
│   │   └── tokenization_rag.py
│   ├── layoutlm
│   │   ├── tokenization_layoutlm.py
│   │   ├── tokenization_layoutlm_fast.py
│   │   └── __init__.py
│   ├── mbart
│   │   ├── convert_mbart_original_checkpoint_to_pytorch.py
│   │   └── __init__.py
│   ├── longformer
│   │   ├── tokenization_longformer.py
│   │   ├── tokenization_longformer_fast.py
│   │   └── convert_longformer_original_pytorch_lightning_to_pytorch.py
│   ├── t5
│   │   └── convert_t5_original_tf_checkpoint_to_pytorch.py
│   ├── lxmert
│   │   ├── convert_lxmert_original_tf_checkpoint_to_pytorch.py
│   │   ├── tokenization_lxmert.py
│   │   ├── tokenization_lxmert_fast.py
│   │   └── __init__.py
│   ├── bert
│   │   └── convert_bert_original_tf_checkpoint_to_pytorch.py
│   ├── albert
│   │   └── convert_albert_original_tf_checkpoint_to_pytorch.py
│   ├── tapas
│   │   └── __init__.py
│   ├── bert_generation
│   │   └── __init__.py
│   ├── squeezebert
│   │   ├── tokenization_squeezebert.py
│   │   ├── __init__.py
│   │   └── tokenization_squeezebert_fast.py
│   ├── funnel
│   │   └── convert_funnel_original_tf_checkpoint_to_pytorch.py
│   ├── deberta
│   │   └── __init__.py
│   ├── prophetnet
│   │   └── __init__.py
│   ├── blenderbot
│   │   └── __init__.py
│   ├── gpt2
│   │   └── convert_gpt2_original_tf_checkpoint_to_pytorch.py
│   ├── marian
│   │   └── __init__.py
│   ├── blenderbot_small
│   │   └── __init__.py
│   ├── openai
│   │   ├── convert_openai_original_tf_checkpoint_to_pytorch.py
│   │   └── tokenization_openai_fast.py
│   ├── ctrl
│   │   └── __init__.py
│   ├── roberta
│   │   └── configuration_roberta.py
│   ├── reformer
│   │   └── __init__.py
│   ├── electra
│   │   ├── tokenization_electra.py
│   │   └── convert_electra_original_tf_checkpoint_to_pytorch.py
│   ├── pegasus
│   │   └── __init__.py
│   ├── distilbert
│   │   └── tokenization_distilbert.py
│   ├── mt5
│   │   └── __init__.py
│   ├── xlm
│   │   └── convert_xlm_original_pytorch_checkpoint_to_pytorch.py
│   └── transfo_xl
│       └── __init__.py
├── commands
│   ├── __init__.py
│   ├── download.py
│   ├── transformers_cli.py
│   └── env.py
├── data
│   ├── datasets
│   │   └── __init__.py
│   ├── processors
│   │   └── __init__.py
│   └── __init__.py
├── dependency_versions_check.py
├── training_args_seq2seq.py
├── dependency_versions_table.py
├── activations_tf.py
├── convert_tf_hub_seq_to_seq_bert_to_pytorch.py
├── activations.py
└── pipelines
    └── text_classification.py
├──
test_command.txt ├── requirements.txt ├── README.md └── .gitignore /transformers/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /transformers/benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /transformers/models/dialogpt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test_command.txt: -------------------------------------------------------------------------------- 1 | python run.py --model_name_or_path facebook/bart-base --do_train --do_eval --task summarization --train_file data/news_summary_train_small.csv --validation_file data/news_summary_valid_small.csv --output_dir output/ --overwrite_output_dir --num_beams=3 --min_summ_length=100 --max_summ_length=320 --length_penalty=1.0 --per_device_train_batch_size=4 --per_device_eval_batch_size=4 --predict_with_generate --text_column Text --summary_column Summary 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.11.0 2 | certifi==2020.12.5 3 | chardet==4.0.0 4 | click==7.1.2 5 | datasets==1.2.1 6 | dill==0.3.3 7 | filelock==3.0.12 8 | idna==2.10 9 | joblib==1.0.0 10 | multiprocess==0.70.11.1 11 | nltk==3.5 12 | numpy==1.19.5 13 | packaging==20.8 14 | pandas==1.2.1 15 | pyarrow==2.0.0 16 | pyparsing==2.4.7 17 | python-dateutil==2.8.1 18 | pytz==2020.5 19 | regex==2020.11.13 20 | requests==2.25.1 21 | rouge-score==0.0.4 22 | sacremoses==0.0.43 23 | six==1.15.0 24 | tokenizers==0.9.4 25 | torch==1.7.1 26 | tqdm==4.49.0 27 | transformers==4.2.2 28 | typing-extensions==3.7.4.3 29 | urllib3==1.26.2 30 | xxhash==2.0.0 31 | -------------------------------------------------------------------------------- /transformers/commands/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import ABC, abstractmethod 16 | from argparse import ArgumentParser 17 | 18 | 19 | class BaseTransformersCLICommand(ABC): 20 | @staticmethod 21 | @abstractmethod 22 | def register_subcommand(parser: ArgumentParser): 23 | raise NotImplementedError() 24 | 25 | @abstractmethod 26 | def run(self): 27 | raise NotImplementedError() 28 | -------------------------------------------------------------------------------- /transformers/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' 
imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from .glue import GlueDataset, GlueDataTrainingArguments 20 | from .language_modeling import ( 21 | LineByLineTextDataset, 22 | LineByLineWithRefDataset, 23 | LineByLineWithSOPTextDataset, 24 | TextDataset, 25 | TextDatasetForNextSentencePrediction, 26 | ) 27 | from .squad import SquadDataset, SquadDataTrainingArguments 28 | -------------------------------------------------------------------------------- /transformers/data/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels 20 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features 21 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor 22 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels 23 | -------------------------------------------------------------------------------- /transformers/utils/dummy_flax_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 
2 | from ..file_utils import requires_flax 3 | 4 | 5 | class FlaxPreTrainedModel: 6 | def __init__(self, *args, **kwargs): 7 | requires_flax(self) 8 | 9 | @classmethod 10 | def from_pretrained(self, *args, **kwargs): 11 | requires_flax(self) 12 | 13 | 14 | FLAX_MODEL_MAPPING = None 15 | 16 | 17 | class FlaxAutoModel: 18 | def __init__(self, *args, **kwargs): 19 | requires_flax(self) 20 | 21 | @classmethod 22 | def from_pretrained(self, *args, **kwargs): 23 | requires_flax(self) 24 | 25 | 26 | class FlaxBertForMaskedLM: 27 | def __init__(self, *args, **kwargs): 28 | requires_flax(self) 29 | 30 | @classmethod 31 | def from_pretrained(self, *args, **kwargs): 32 | requires_flax(self) 33 | 34 | 35 | class FlaxBertModel: 36 | def __init__(self, *args, **kwargs): 37 | requires_flax(self) 38 | 39 | @classmethod 40 | def from_pretrained(self, *args, **kwargs): 41 | requires_flax(self) 42 | 43 | 44 | class FlaxRobertaModel: 45 | def __init__(self, *args, **kwargs): 46 | requires_flax(self) 47 | 48 | @classmethod 49 | def from_pretrained(self, *args, **kwargs): 50 | requires_flax(self) 51 | -------------------------------------------------------------------------------- /transformers/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from .metrics import glue_compute_metrics, xnli_compute_metrics 20 | from .processors import ( 21 | DataProcessor, 22 | InputExample, 23 | InputFeatures, 24 | SingleSentenceClassificationProcessor, 25 | SquadExample, 26 | SquadFeatures, 27 | SquadV1Processor, 28 | SquadV2Processor, 29 | glue_convert_examples_to_features, 30 | glue_output_modes, 31 | glue_processors, 32 | glue_tasks_num_labels, 33 | squad_convert_examples_to_features, 34 | xnli_output_modes, 35 | xnli_processors, 36 | xnli_tasks_num_labels, 37 | ) 38 | -------------------------------------------------------------------------------- /transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ XLM-ProphetNet model configuration """ 16 | 17 | 18 | from ...utils import logging 19 | from ..prophetnet.configuration_prophetnet import ProphetNetConfig 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { 25 | "microsoft/xprophetnet-large-wiki100-cased": "https://huggingface.co/microsoft/xprophetnet-large-wiki100-cased/resolve/main/config.json", 26 | } 27 | 28 | 29 | class XLMProphetNetConfig(ProphetNetConfig): 30 | """ 31 | This class overrides :class:`~transformers.ProphetNetConfig`. Please check the superclass for the appropriate 32 | documentation alongside usage examples. 33 | """ 34 | 35 | model_type = "xlm-prophetnet" 36 | -------------------------------------------------------------------------------- /transformers/models/xlm_prophetnet/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_sentencepiece_available, is_torch_available 20 | from .configuration_xlm_prophetnet import XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMProphetNetConfig 21 | 22 | 23 | if is_sentencepiece_available(): 24 | from .tokenization_xlm_prophetnet import XLMProphetNetTokenizer 25 | 26 | if is_torch_available(): 27 | from .modeling_xlm_prophetnet import ( 28 | XLM_PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST, 29 | XLMProphetNetDecoder, 30 | XLMProphetNetEncoder, 31 | XLMProphetNetForCausalLM, 32 | XLMProphetNetForConditionalGeneration, 33 | XLMProphetNetModel, 34 | ) 35 | -------------------------------------------------------------------------------- /transformers/models/camembert/configuration_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ CamemBERT configuration """ 17 | 18 | from ...utils import logging 19 | from ..roberta.configuration_roberta import RobertaConfig 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 25 | "camembert-base": "https://huggingface.co/camembert-base/resolve/main/config.json", 26 | "umberto-commoncrawl-cased-v1": "https://huggingface.co/Musixmatch/umberto-commoncrawl-cased-v1/resolve/main/config.json", 27 | "umberto-wikipedia-uncased-v1": "https://huggingface.co/Musixmatch/umberto-wikipedia-uncased-v1/resolve/main/config.json", 28 | } 29 | 30 | 31 | class CamembertConfig(RobertaConfig): 32 | """ 33 | This class overrides :class:`~transformers.RobertaConfig`. Please check the superclass for the appropriate 34 | documentation alongside usage examples. 35 | """ 36 | 37 | model_type = "camembert" 38 | -------------------------------------------------------------------------------- /transformers/models/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from . import ( 20 | albert, 21 | auto, 22 | bart, 23 | barthez, 24 | bert, 25 | bert_generation, 26 | bert_japanese, 27 | bertweet, 28 | blenderbot, 29 | blenderbot_small, 30 | camembert, 31 | ctrl, 32 | deberta, 33 | dialogpt, 34 | distilbert, 35 | dpr, 36 | electra, 37 | encoder_decoder, 38 | flaubert, 39 | fsmt, 40 | funnel, 41 | gpt2, 42 | herbert, 43 | layoutlm, 44 | led, 45 | longformer, 46 | lxmert, 47 | marian, 48 | mbart, 49 | mmbt, 50 | mobilebert, 51 | mpnet, 52 | mt5, 53 | openai, 54 | pegasus, 55 | phobert, 56 | prophetnet, 57 | rag, 58 | reformer, 59 | retribert, 60 | roberta, 61 | squeezebert, 62 | t5, 63 | tapas, 64 | transfo_xl, 65 | xlm, 66 | xlm_roberta, 67 | xlnet, 68 | ) 69 | -------------------------------------------------------------------------------- /transformers/models/phobert/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule 22 | 23 | 24 | _import_structure = { 25 | "tokenization_phobert": ["PhobertTokenizer"], 26 | } 27 | 28 | 29 | if TYPE_CHECKING: 30 | from .tokenization_phobert import PhobertTokenizer 31 | 32 | else: 33 | import importlib 34 | import os 35 | import sys 36 | 37 | class _LazyModule(_BaseLazyModule): 38 | """ 39 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 40 | """ 41 | 42 | __file__ = globals()["__file__"] 43 | __path__ = [os.path.dirname(__file__)] 44 | 45 | def _get_module(self, module_name: str): 46 | return importlib.import_module("." + module_name, self.__name__) 47 | 48 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 49 | -------------------------------------------------------------------------------- /transformers/models/bertweet/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule 22 | 23 | 24 | _import_structure = { 25 | "tokenization_bertweet": ["BertweetTokenizer"], 26 | } 27 | 28 | 29 | if TYPE_CHECKING: 30 | from .tokenization_bertweet import BertweetTokenizer 31 | 32 | else: 33 | import importlib 34 | import os 35 | import sys 36 | 37 | class _LazyModule(_BaseLazyModule): 38 | """ 39 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 40 | """ 41 | 42 | __file__ = globals()["__file__"] 43 | __path__ = [os.path.dirname(__file__)] 44 | 45 | def _get_module(self, module_name: str): 46 | return importlib.import_module("." + module_name, self.__name__) 47 | 48 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 49 | -------------------------------------------------------------------------------- /transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import os 17 | 18 | import torch 19 | 20 | from transformers.file_utils import WEIGHTS_NAME 21 | 22 | 23 | DIALOGPT_MODELS = ["small", "medium", "large"] 24 | 25 | OLD_KEY = "lm_head.decoder.weight" 26 | NEW_KEY = "lm_head.weight" 27 | 28 | 29 | def convert_dialogpt_checkpoint(checkpoint_path: str, pytorch_dump_folder_path: str): 30 | d = torch.load(checkpoint_path) 31 | d[NEW_KEY] = d.pop(OLD_KEY) 32 | os.makedirs(pytorch_dump_folder_path, exist_ok=True) 33 | torch.save(d, os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)) 34 | 35 | 36 | if __name__ == "__main__": 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument("--dialogpt_path", default=".", type=str) 39 | args = parser.parse_args() 40 | for MODEL in DIALOGPT_MODELS: 41 | checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl") 42 | pytorch_dump_folder_path = f"./DialoGPT-{MODEL}" 43 | convert_dialogpt_checkpoint( 44 | checkpoint_path, 45 | pytorch_dump_folder_path, 46 | ) 47 | -------------------------------------------------------------------------------- /transformers/models/bert_japanese/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule 22 | 23 | 24 | _import_structure = { 25 | "tokenization_bert_japanese": ["BertJapaneseTokenizer", "CharacterTokenizer", "MecabTokenizer"], 26 | } 27 | 28 | 29 | if TYPE_CHECKING: 30 | from .tokenization_bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer 31 | 32 | else: 33 | import importlib 34 | import os 35 | import sys 36 | 37 | class _LazyModule(_BaseLazyModule): 38 | """ 39 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 40 | """ 41 | 42 | __file__ = globals()["__file__"] 43 | __path__ = [os.path.dirname(__file__)] 44 | 45 | def _get_module(self, module_name: str): 46 | return importlib.import_module("." 
+ module_name, self.__name__) 47 | 48 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 49 | -------------------------------------------------------------------------------- /transformers/models/mmbt/configuration_mmbt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # Copyright (c) HuggingFace Inc. team. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ MMBT configuration """ 17 | 18 | from ...utils import logging 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | 24 | class MMBTConfig(object): 25 | """ 26 | This is the configuration class to store the configuration of a :class:`~transformers.MMBTModel`. It is used to 27 | instantiate a MMBT model according to the specified arguments, defining the model architecture. 28 | 29 | Args: 30 | config (:class:`~transformers.PreTrainedConfig`): 31 | Config of the underlying Transformer models. Its values are copied over to use a single config. 32 | num_labels (:obj:`int`, `optional`): 33 | Size of final Linear layer for classification. 34 | modal_hidden_size (:obj:`int`, `optional`, defaults to 2048): 35 | Embedding dimension of the non-text modality encoder. 36 | """ 37 | 38 | def __init__(self, config, num_labels=None, modal_hidden_size=2048): 39 | self.__dict__ = config.__dict__ 40 | self.modal_hidden_size = modal_hidden_size 41 | if num_labels: 42 | self.num_labels = num_labels 43 | -------------------------------------------------------------------------------- /transformers/dependency_versions_check.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import sys 15 | 16 | from .dependency_versions_table import deps 17 | from .utils.versions import require_version_core 18 | 19 | 20 | # define which module versions we always want to check at run time 21 | # (usually the ones defined in `install_requires` in setup.py) 22 | # 23 | # order specific notes: 24 | # - tqdm must be checked before tokenizers 25 | 26 | pkgs_to_check_at_runtime = "python tqdm regex sacremoses requests packaging filelock numpy tokenizers".split() 27 | if sys.version_info < (3, 7): 28 | pkgs_to_check_at_runtime.append("dataclasses") 29 | if sys.version_info < (3, 8): 30 | pkgs_to_check_at_runtime.append("importlib_metadata") 31 | 32 | for pkg in pkgs_to_check_at_runtime: 33 | if pkg in deps: 34 | if pkg == "tokenizers": 35 | # must be loaded here, or else tqdm check may fail 36 | from .file_utils import is_tokenizers_available 37 | 38 | if not is_tokenizers_available(): 39 | continue # not required, check version only if installed 40 | 41 | require_version_core(deps[pkg]) 42 | else: 43 | raise ValueError(f"can't find {pkg} in {deps.keys()}, check dependency_versions_table.py") 44 | -------------------------------------------------------------------------------- /transformers/training_args_seq2seq.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | from dataclasses import dataclass, field 17 | 18 | from .file_utils import add_start_docstrings 19 | from .training_args import TrainingArguments 20 | 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | @dataclass 26 | @add_start_docstrings(TrainingArguments.__doc__) 27 | class Seq2SeqTrainingArguments(TrainingArguments): 28 | """ 29 | sortish_sampler (:obj:`bool`, `optional`, defaults to :obj:`False`): 30 | Whether to use a `sortish sampler` or not. Only possible if the underlying datasets are `Seq2SeqDataset` for 31 | now but will become generally available in the near future. 32 | 33 | It sorts the inputs according to lengths in order to minimize the padding size, with a bit of randomness for 34 | the training set. 35 | predict_with_generate (:obj:`bool`, `optional`, defaults to :obj:`False`): 36 | Whether to use generate to calculate generative metrics (ROUGE, BLEU). 37 | """ 38 | 39 | sortish_sampler: bool = field(default=False, metadata={"help": "Whether to use SortishSampler or not."}) 40 | predict_with_generate: bool = field( 41 | default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."} 42 | ) 43 | -------------------------------------------------------------------------------- /transformers/models/mobilebert/tokenization_mobilebert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # 3 | # Copyright 2020 The HuggingFace Team. All rights reserved. 
4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """Tokenization classes for MobileBERT.""" 17 | 18 | from ...utils import logging 19 | from ..bert.tokenization_bert import BertTokenizer 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 25 | 26 | PRETRAINED_VOCAB_FILES_MAP = { 27 | "vocab_file": {"mobilebert-uncased": "https://huggingface.co/google/mobilebert-uncased/resolve/main/vocab.txt"} 28 | } 29 | 30 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"mobilebert-uncased": 512} 31 | 32 | 33 | PRETRAINED_INIT_CONFIGURATION = {} 34 | 35 | 36 | class MobileBertTokenizer(BertTokenizer): 37 | r""" 38 | Construct a MobileBERT tokenizer. 39 | 40 | :class:`~transformers.MobileBertTokenizer is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 41 | tokenization: punctuation splitting and wordpiece. 42 | 43 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 44 | parameters. 45 | """ 46 | 47 | vocab_files_names = VOCAB_FILES_NAMES 48 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 49 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 50 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 51 | -------------------------------------------------------------------------------- /transformers/models/herbert/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_tokenizers_available 22 | 23 | 24 | _import_structure = { 25 | "tokenization_herbert": ["HerbertTokenizer"], 26 | } 27 | 28 | if is_tokenizers_available(): 29 | _import_structure["tokenization_herbert_fast"] = ["HerbertTokenizerFast"] 30 | 31 | 32 | if TYPE_CHECKING: 33 | from .tokenization_herbert import HerbertTokenizer 34 | 35 | if is_tokenizers_available(): 36 | from .tokenization_herbert_fast import HerbertTokenizerFast 37 | 38 | else: 39 | import importlib 40 | import os 41 | import sys 42 | 43 | class _LazyModule(_BaseLazyModule): 44 | """ 45 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 46 | """ 47 | 48 | __file__ = globals()["__file__"] 49 | __path__ = [os.path.dirname(__file__)] 50 | 51 | def _get_module(self, module_name: str): 52 | return importlib.import_module("." + module_name, self.__name__) 53 | 54 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 55 | -------------------------------------------------------------------------------- /transformers/models/mmbt/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_mmbt": ["MMBTConfig"], 26 | } 27 | 28 | if is_torch_available(): 29 | _import_structure["modeling_mmbt"] = ["MMBTForClassification", "MMBTModel", "ModalEmbeddings"] 30 | 31 | 32 | if TYPE_CHECKING: 33 | from .configuration_mmbt import MMBTConfig 34 | 35 | if is_torch_available(): 36 | from .modeling_mmbt import MMBTForClassification, MMBTModel, ModalEmbeddings 37 | 38 | else: 39 | import importlib 40 | import os 41 | import sys 42 | 43 | class _LazyModule(_BaseLazyModule): 44 | """ 45 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 46 | """ 47 | 48 | __file__ = globals()["__file__"] 49 | __path__ = [os.path.dirname(__file__)] 50 | 51 | def _get_module(self, module_name: str): 52 | return importlib.import_module("." + module_name, self.__name__) 53 | 54 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 55 | -------------------------------------------------------------------------------- /transformers/models/encoder_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. 
So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_encoder_decoder": ["EncoderDecoderConfig"], 26 | } 27 | 28 | if is_torch_available(): 29 | _import_structure["modeling_encoder_decoder"] = ["EncoderDecoderModel"] 30 | 31 | 32 | if TYPE_CHECKING: 33 | from .configuration_encoder_decoder import EncoderDecoderConfig 34 | 35 | if is_torch_available(): 36 | from .modeling_encoder_decoder import EncoderDecoderModel 37 | 38 | else: 39 | import importlib 40 | import os 41 | import sys 42 | 43 | class _LazyModule(_BaseLazyModule): 44 | """ 45 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 46 | """ 47 | 48 | __file__ = globals()["__file__"] 49 | __path__ = [os.path.dirname(__file__)] 50 | 51 | def _get_module(self, module_name: str): 52 | return importlib.import_module("." + module_name, self.__name__) 53 | 54 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 55 | -------------------------------------------------------------------------------- /transformers/models/bart/tokenization_bart_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | from ...utils import logging 17 | from ..roberta.tokenization_roberta_fast import RobertaTokenizerFast 18 | from .tokenization_bart import BartTokenizer 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | 24 | # vocab and merges same as roberta 25 | vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json" 26 | merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt" 27 | tokenizer_url = "https://huggingface.co/roberta-large/resolve/main/tokenizer.json" 28 | _all_bart_models = [ 29 | "facebook/bart-base", 30 | "facebook/bart-large", 31 | "facebook/bart-large-mnli", 32 | "facebook/bart-large-cnn", 33 | "facebook/bart-large-xsum", 34 | "yjernite/bart_eli5", 35 | # This is not exhaustive: see https://huggingface.co/models?filter=bart 36 | ] 37 | 38 | 39 | class BartTokenizerFast(RobertaTokenizerFast): 40 | # merges and vocab same as Roberta 41 | max_model_input_sizes = {m: 1024 for m in _all_bart_models} 42 | pretrained_vocab_files_map = { 43 | "vocab_file": {m: vocab_url for m in _all_bart_models}, 44 | "merges_file": {m: merges_url for m in _all_bart_models}, 45 | "tokenizer_file": {m: tokenizer_url for m in _all_bart_models}, 46 | } 47 | slow_tokenizer_class = BartTokenizer 48 | -------------------------------------------------------------------------------- /transformers/dependency_versions_table.py: -------------------------------------------------------------------------------- 1 | # THIS FILE HAS BEEN AUTOGENERATED. To update: 2 | # 1. modify the `_deps` dict in setup.py 3 | # 2. run `make deps_table_update`` 4 | deps = { 5 | "black": "black>=20.8b1", 6 | "cookiecutter": "cookiecutter==1.7.2", 7 | "dataclasses": "dataclasses", 8 | "datasets": "datasets", 9 | "faiss-cpu": "faiss-cpu", 10 | "fastapi": "fastapi", 11 | "filelock": "filelock", 12 | "flake8": "flake8>=3.8.3", 13 | "flax": "flax>=0.2.2", 14 | "fugashi": "fugashi>=1.0", 15 | "importlib_metadata": "importlib_metadata", 16 | "ipadic": "ipadic>=1.0.0,<2.0", 17 | "isort": "isort>=5.5.4", 18 | "jax": "jax>=0.2.0", 19 | "jaxlib": "jaxlib==0.1.55", 20 | "keras2onnx": "keras2onnx", 21 | "numpy": "numpy", 22 | "onnxconverter-common": "onnxconverter-common", 23 | "onnxruntime-tools": "onnxruntime-tools>=1.4.2", 24 | "onnxruntime": "onnxruntime>=1.4.0", 25 | "packaging": "packaging", 26 | "parameterized": "parameterized", 27 | "protobuf": "protobuf", 28 | "psutil": "psutil", 29 | "pydantic": "pydantic", 30 | "pytest": "pytest", 31 | "pytest-xdist": "pytest-xdist", 32 | "python": "python>=3.6.0", 33 | "recommonmark": "recommonmark", 34 | "regex": "regex!=2019.12.17", 35 | "requests": "requests", 36 | "sacremoses": "sacremoses", 37 | "scikit-learn": "scikit-learn", 38 | "sentencepiece": "sentencepiece==0.1.91", 39 | "sphinx-copybutton": "sphinx-copybutton", 40 | "sphinx-markdown-tables": "sphinx-markdown-tables", 41 | "sphinx-rtd-theme": "sphinx-rtd-theme==0.4.3", 42 | "sphinx": "sphinx==3.2.1", 43 | "starlette": "starlette", 44 | "tensorflow-cpu": "tensorflow-cpu>=2.3", 45 | "tensorflow": "tensorflow>=2.3", 46 | "timeout-decorator": "timeout-decorator", 47 | "tokenizers": "tokenizers==0.9.4", 48 | "torch": "torch>=1.0", 49 | "tqdm": "tqdm>=4.27", 50 | "unidic": "unidic>=1.0.2", 51 | "unidic_lite": "unidic_lite>=1.0.7", 52 | "uvicorn": "uvicorn", 53 | } 54 | -------------------------------------------------------------------------------- /transformers/commands/download.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The 
HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from argparse import ArgumentParser 16 | 17 | from . import BaseTransformersCLICommand 18 | 19 | 20 | def download_command_factory(args): 21 | return DownloadCommand(args.model, args.cache_dir, args.force) 22 | 23 | 24 | class DownloadCommand(BaseTransformersCLICommand): 25 | @staticmethod 26 | def register_subcommand(parser: ArgumentParser): 27 | download_parser = parser.add_parser("download") 28 | download_parser.add_argument( 29 | "--cache-dir", type=str, default=None, help="Path to location to store the models" 30 | ) 31 | download_parser.add_argument( 32 | "--force", action="store_true", help="Force the model to be download even if already in cache-dir" 33 | ) 34 | download_parser.add_argument("model", type=str, help="Name of the model to download") 35 | download_parser.set_defaults(func=download_command_factory) 36 | 37 | def __init__(self, model: str, cache: str, force: bool): 38 | self._model = model 39 | self._cache = cache 40 | self._force = force 41 | 42 | def run(self): 43 | from ..models.auto import AutoModel, AutoTokenizer 44 | 45 | AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 46 | AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 47 | -------------------------------------------------------------------------------- /transformers/commands/transformers_cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | from argparse import ArgumentParser 17 | 18 | from .add_new_model import AddNewModelCommand 19 | from .convert import ConvertCommand 20 | from .download import DownloadCommand 21 | from .env import EnvironmentCommand 22 | from .lfs import LfsCommands 23 | from .run import RunCommand 24 | from .serving import ServeCommand 25 | from .user import UserCommands 26 | 27 | 28 | def main(): 29 | parser = ArgumentParser("Transformers CLI tool", usage="transformers-cli []") 30 | commands_parser = parser.add_subparsers(help="transformers-cli command helpers") 31 | 32 | # Register commands 33 | ConvertCommand.register_subcommand(commands_parser) 34 | DownloadCommand.register_subcommand(commands_parser) 35 | EnvironmentCommand.register_subcommand(commands_parser) 36 | RunCommand.register_subcommand(commands_parser) 37 | ServeCommand.register_subcommand(commands_parser) 38 | UserCommands.register_subcommand(commands_parser) 39 | AddNewModelCommand.register_subcommand(commands_parser) 40 | LfsCommands.register_subcommand(commands_parser) 41 | 42 | # Let's go 43 | args = parser.parse_args() 44 | 45 | if not hasattr(args, "func"): 46 | parser.print_help() 47 | exit(1) 48 | 49 | # Run 50 | service = args.func(args) 51 | service.run() 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /transformers/models/led/tokenization_led.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for LED.""" 16 | from ...utils import logging 17 | from ..bart.tokenization_bart import BartTokenizer 18 | 19 | 20 | logger = logging.get_logger(__name__) 21 | 22 | PRETRAINED_VOCAB_FILES_MAP = { 23 | "vocab_file": { 24 | "allenai/led-base-16384": "https://huggingface.co/allenai/led-base-16384/resolve/main/vocab.json", 25 | }, 26 | "merges_file": { 27 | "allenai/led-base-16384": "https://huggingface.co/allenai/led-base-16384/resolve/main/merges.txt", 28 | }, 29 | "tokenizer_file": { 30 | "allenai/led-base-16384": "https://huggingface.co/allenai/led-base-16384/resolve/main/tokenizer.json", 31 | }, 32 | } 33 | 34 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 35 | "allenai/led-base-16384": 16384, 36 | } 37 | 38 | 39 | class LEDTokenizer(BartTokenizer): 40 | """ 41 | Construct a LED tokenizer. 42 | 43 | :class:`~transformers.LEDTokenizer` is identical to :class:`~transformers.BartTokenizer` and runs end-to-end 44 | tokenization: punctuation splitting and wordpiece. 45 | 46 | Refer to superclass :class:`~transformers.BartTokenizer` for usage examples and documentation concerning 47 | parameters. 
48 | """ 49 | 50 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 51 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 52 | -------------------------------------------------------------------------------- /transformers/models/retribert/tokenization_retribert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for RetriBERT.""" 16 | 17 | from ...utils import logging 18 | from ..bert.tokenization_bert import BertTokenizer 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 24 | 25 | PRETRAINED_VOCAB_FILES_MAP = { 26 | "vocab_file": { 27 | "yjernite/retribert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", 28 | } 29 | } 30 | 31 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 32 | "yjernite/retribert-base-uncased": 512, 33 | } 34 | 35 | 36 | PRETRAINED_INIT_CONFIGURATION = { 37 | "yjernite/retribert-base-uncased": {"do_lower_case": True}, 38 | } 39 | 40 | 41 | class RetriBertTokenizer(BertTokenizer): 42 | r""" 43 | Constructs a RetriBERT tokenizer. 44 | 45 | :class:`~transformers.RetroBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 46 | tokenization: punctuation splitting and wordpiece. 47 | 48 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 49 | parameters. 50 | """ 51 | 52 | vocab_files_names = VOCAB_FILES_NAMES 53 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 54 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 55 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 56 | model_input_names = ["attention_mask"] 57 | -------------------------------------------------------------------------------- /transformers/models/xlm_roberta/configuration_xlm_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ XLM-RoBERTa configuration """ 17 | 18 | from ...utils import logging 19 | from ..roberta.configuration_roberta import RobertaConfig 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 25 | "xlm-roberta-base": "https://huggingface.co/xlm-roberta-base/resolve/main/config.json", 26 | "xlm-roberta-large": "https://huggingface.co/xlm-roberta-large/resolve/main/config.json", 27 | "xlm-roberta-large-finetuned-conll02-dutch": "https://huggingface.co/xlm-roberta-large-finetuned-conll02-dutch/resolve/main/config.json", 28 | "xlm-roberta-large-finetuned-conll02-spanish": "https://huggingface.co/xlm-roberta-large-finetuned-conll02-spanish/resolve/main/config.json", 29 | "xlm-roberta-large-finetuned-conll03-english": "https://huggingface.co/xlm-roberta-large-finetuned-conll03-english/resolve/main/config.json", 30 | "xlm-roberta-large-finetuned-conll03-german": "https://huggingface.co/xlm-roberta-large-finetuned-conll03-german/resolve/main/config.json", 31 | } 32 | 33 | 34 | class XLMRobertaConfig(RobertaConfig): 35 | """ 36 | This class overrides :class:`~transformers.RobertaConfig`. Please check the superclass for the appropriate 37 | documentation alongside usage examples. 38 | """ 39 | 40 | model_type = "xlm-roberta" 41 | -------------------------------------------------------------------------------- /transformers/models/barthez/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_sentencepiece_available, is_tokenizers_available 22 | 23 | 24 | _import_structure = {} 25 | 26 | if is_sentencepiece_available(): 27 | _import_structure["tokenization_barthez"] = ["BarthezTokenizer"] 28 | 29 | if is_tokenizers_available(): 30 | _import_structure["tokenization_barthez_fast"] = ["BarthezTokenizerFast"] 31 | 32 | 33 | if TYPE_CHECKING: 34 | 35 | if is_sentencepiece_available(): 36 | from .tokenization_barthez import BarthezTokenizer 37 | 38 | if is_tokenizers_available(): 39 | from .tokenization_barthez_fast import BarthezTokenizerFast 40 | 41 | else: 42 | import importlib 43 | import os 44 | import sys 45 | 46 | class _LazyModule(_BaseLazyModule): 47 | """ 48 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 49 | """ 50 | 51 | __file__ = globals()["__file__"] 52 | __path__ = [os.path.dirname(__file__)] 53 | 54 | def _get_module(self, module_name: str): 55 | return importlib.import_module("." 
+ module_name, self.__name__) 56 | 57 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 58 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Finetune-Transformers 2 | 3 | ## Finetuning and evaluating transformers on a summarization task 4 | The main objective of this module is to fine-tune and evaluate a model (pre-trained on a large-scale dataset) on domain-specific data. Finetuning improves the performance of the model on domain-specific tasks. A pre-trained model can be finetuned on a number of downstream tasks, depending on its architecture. 5 | As an example, I finetune sequence-to-sequence models such as T5, BART and Pegasus on an abstractive summarization task using the Trainer API from [Hugging Face](https://huggingface.co/transformers/main_classes/trainer.html). 6 | 7 | * A number of pre-trained models can be finetuned, such as: 8 | * T5 (small, base, large, 3B, 11B) 9 | * BART (base, large-cnn, large-mnli) 10 | * Longformer Encoder Decoder (allenai/led-base-16384, allenai/led-large-16384) 11 | * Pegasus (large, xsum, multi_news) 12 | 13 | Check out [pre-trained models](https://huggingface.co/models) to see the checkpoints available for each of them. 14 | *** 15 | ## Script 16 | Finetuning with a custom dataset placed at [`data/`](https://github.com/nsi319/Finetune-Transformers/tree/main/data): 17 | 18 | ```bash 19 | python run.py \ 20 | --model_name_or_path facebook/bart-base \ 21 | --train_file data/news_summary_train_small.csv \ 22 | --validation_file data/news_summary_valid_small.csv \ 23 | --text_column Text \ 24 | --summary_column Summary \ 25 | --output_dir output/ \ 26 | --overwrite_output_dir \ 27 | --do_train \ 28 | --do_eval \ 29 | --num_beams=3 \ 30 | --min_summ_length=100 \ 31 | --max_summ_length=250 \ 32 | --length_penalty=1.0 \ 33 | --per_device_train_batch_size=4 \ 34 | --per_device_eval_batch_size=4 \ 35 | --predict_with_generate 36 | ``` 37 | 38 | To see all the possible command-line options, run: 39 | 40 | ```bash 41 | python run.py --help 42 | ``` 43 | If you are using **Google Colab**, open [`colab/finetuning.ipynb`](https://github.com/nsi319/Finetune-Transformers/blob/main/colab/finetuning.ipynb) in Colab, save a copy in Drive, and follow the instructions. 44 | 45 | 46 | -------------------------------------------------------------------------------- /transformers/models/bart/tokenization_bart.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | from ...utils import logging 17 | from ..roberta.tokenization_roberta import RobertaTokenizer 18 | 19 | 20 | logger = logging.get_logger(__name__) 21 | 22 | 23 | # vocab and merges same as roberta 24 | vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json" 25 | merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt" 26 | _all_bart_models = [ 27 | "facebook/bart-base", 28 | "facebook/bart-large", 29 | "facebook/bart-large-mnli", 30 | "facebook/bart-large-cnn", 31 | "facebook/bart-large-xsum", 32 | "yjernite/bart_eli5", 33 | # This is not exhaustive: see https://huggingface.co/models?filter=bart 34 | ] 35 | 36 | 37 | class BartTokenizer(RobertaTokenizer): 38 | r""" 39 | Construct a BART tokenizer. 40 | 41 | :class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer` and adds a new 42 | :meth:`~transformers.BartTokenizer.prepare_seq2seq_batch` 43 | 44 | Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the 45 | initialization parameters and other methods. 46 | """ 47 | # merges and vocab same as Roberta 48 | max_model_input_sizes = {m: 1024 for m in _all_bart_models} 49 | pretrained_vocab_files_map = { 50 | "vocab_file": {m: vocab_url for m in _all_bart_models}, 51 | "merges_file": {m: merges_url for m in _all_bart_models}, 52 | } 53 | -------------------------------------------------------------------------------- /transformers/models/fsmt/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_fsmt": ["FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FSMTConfig"], 26 | "tokenization_fsmt": ["FSMTTokenizer"], 27 | } 28 | 29 | if is_torch_available(): 30 | _import_structure["modeling_fsmt"] = ["FSMTForConditionalGeneration", "FSMTModel", "PretrainedFSMTModel"] 31 | 32 | 33 | if TYPE_CHECKING: 34 | from .configuration_fsmt import FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP, FSMTConfig 35 | from .tokenization_fsmt import FSMTTokenizer 36 | 37 | if is_torch_available(): 38 | from .modeling_fsmt import FSMTForConditionalGeneration, FSMTModel, PretrainedFSMTModel 39 | 40 | else: 41 | import importlib 42 | import os 43 | import sys 44 | 45 | class _LazyModule(_BaseLazyModule): 46 | """ 47 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 
48 | """ 49 | 50 | __file__ = globals()["__file__"] 51 | __path__ = [os.path.dirname(__file__)] 52 | 53 | def _get_module(self, module_name: str): 54 | return importlib.import_module("." + module_name, self.__name__) 55 | 56 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 57 | -------------------------------------------------------------------------------- /transformers/models/rag/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_rag": ["RagConfig"], 26 | "retrieval_rag": ["RagRetriever"], 27 | "tokenization_rag": ["RagTokenizer"], 28 | } 29 | 30 | if is_torch_available(): 31 | _import_structure["modeling_rag"] = ["RagModel", "RagSequenceForGeneration", "RagTokenForGeneration"] 32 | 33 | 34 | if TYPE_CHECKING: 35 | from .configuration_rag import RagConfig 36 | from .retrieval_rag import RagRetriever 37 | from .tokenization_rag import RagTokenizer 38 | 39 | if is_torch_available(): 40 | from .modeling_rag import RagModel, RagSequenceForGeneration, RagTokenForGeneration 41 | 42 | else: 43 | import importlib 44 | import os 45 | import sys 46 | 47 | class _LazyModule(_BaseLazyModule): 48 | """ 49 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 50 | """ 51 | 52 | __file__ = globals()["__file__"] 53 | __path__ = [os.path.dirname(__file__)] 54 | 55 | def _get_module(self, module_name: str): 56 | return importlib.import_module("." + module_name, self.__name__) 57 | 58 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 59 | -------------------------------------------------------------------------------- /transformers/models/led/tokenization_led_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for LED.""" 16 | from ...utils import logging 17 | from ..bart.tokenization_bart_fast import BartTokenizerFast 18 | from .tokenization_led import LEDTokenizer 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | PRETRAINED_VOCAB_FILES_MAP = { 24 | "vocab_file": { 25 | "allenai/led-base-16384": "https://huggingface.co/allenai/led-base-16384/resolve/main/vocab.json", 26 | }, 27 | "merges_file": { 28 | "allenai/led-base-16384": "https://huggingface.co/allenai/led-base-16384/resolve/main/merges.txt", 29 | }, 30 | "tokenizer_file": { 31 | "allenai/led-base-16384": "https://huggingface.co/allenai/led-base-16384/resolve/main/tokenizer.json", 32 | }, 33 | } 34 | 35 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 36 | "allenai/led-base-16384": 16384, 37 | } 38 | 39 | 40 | class LEDTokenizerFast(BartTokenizerFast): 41 | r""" 42 | Construct a "fast" LED tokenizer (backed by HuggingFace's `tokenizers` library). 43 | 44 | :class:`~transformers.LEDTokenizerFast` is identical to :class:`~transformers.BartTokenizerFast` and runs 45 | end-to-end tokenization: punctuation splitting and wordpiece. 46 | 47 | Refer to superclass :class:`~transformers.BartTokenizerFast` for usage examples and documentation concerning 48 | parameters. 49 | """ 50 | 51 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 52 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 53 | slow_tokenizer_class = LEDTokenizer 54 | -------------------------------------------------------------------------------- /transformers/models/layoutlm/tokenization_layoutlm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Tokenization class for model LayoutLM.""" 16 | 17 | 18 | from ...utils import logging 19 | from ..bert.tokenization_bert import BertTokenizer 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 25 | 26 | PRETRAINED_VOCAB_FILES_MAP = { 27 | "vocab_file": { 28 | "microsoft/layoutlm-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", 29 | "microsoft/layoutlm-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt", 30 | } 31 | } 32 | 33 | 34 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 35 | "microsoft/layoutlm-base-uncased": 512, 36 | "microsoft/layoutlm-large-uncased": 512, 37 | } 38 | 39 | 40 | PRETRAINED_INIT_CONFIGURATION = { 41 | "microsoft/layoutlm-base-uncased": {"do_lower_case": True}, 42 | "microsoft/layoutlm-large-uncased": {"do_lower_case": True}, 43 | } 44 | 45 | 46 | class LayoutLMTokenizer(BertTokenizer): 47 | r""" 48 | Constructs a LayoutLM tokenizer. 49 | 50 | :class:`~transformers.LayoutLMTokenizer is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 51 | tokenization: punctuation splitting + wordpiece. 
52 | 53 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 54 | parameters. 55 | """ 56 | 57 | vocab_files_names = VOCAB_FILES_NAMES 58 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 59 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 60 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 61 | -------------------------------------------------------------------------------- /transformers/models/mobilebert/tokenization_mobilebert_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # 3 | # Copyright 2020 The HuggingFace Team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """Tokenization classes for MobileBERT.""" 17 | 18 | from ...utils import logging 19 | from ..bert.tokenization_bert_fast import BertTokenizerFast 20 | from .tokenization_mobilebert import MobileBertTokenizer 21 | 22 | 23 | logger = logging.get_logger(__name__) 24 | 25 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} 26 | 27 | PRETRAINED_VOCAB_FILES_MAP = { 28 | "vocab_file": {"mobilebert-uncased": "https://huggingface.co/google/mobilebert-uncased/resolve/main/vocab.txt"}, 29 | "tokenizer_file": { 30 | "mobilebert-uncased": "https://huggingface.co/google/mobilebert-uncased/resolve/main/tokenizer.json" 31 | }, 32 | } 33 | 34 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"mobilebert-uncased": 512} 35 | 36 | 37 | PRETRAINED_INIT_CONFIGURATION = {} 38 | 39 | 40 | class MobileBertTokenizerFast(BertTokenizerFast): 41 | r""" 42 | Construct a "fast" MobileBERT tokenizer (backed by HuggingFace's `tokenizers` library). 43 | 44 | :class:`~transformers.MobileBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs 45 | end-to-end tokenization: punctuation splitting and wordpiece. 46 | 47 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning 48 | parameters. 49 | """ 50 | 51 | vocab_files_names = VOCAB_FILES_NAMES 52 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 53 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 54 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 55 | slow_tokenizer_class = MobileBertTokenizer 56 | -------------------------------------------------------------------------------- /transformers/models/mbart/convert_mbart_original_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | 17 | import torch 18 | 19 | from transformers import BartForConditionalGeneration, MBartConfig 20 | from transformers.models.bart.convert_bart_original_pytorch_checkpoint_to_pytorch import remove_ignore_keys_ 21 | 22 | 23 | def convert_fairseq_mbart_checkpoint_from_disk(checkpoint_path, hf_config_path="facebook/mbart-large-en-ro"): 24 | state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] 25 | remove_ignore_keys_(state_dict) 26 | vocab_size = state_dict["encoder.embed_tokens.weight"].shape[0] 27 | mbart_config = MBartConfig.from_pretrained(hf_config_path, vocab_size=vocab_size) 28 | state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] 29 | model = BartForConditionalGeneration(mbart_config) 30 | model.model.load_state_dict(state_dict) 31 | return model 32 | 33 | 34 | if __name__ == "__main__": 35 | parser = argparse.ArgumentParser() 36 | # Required parameters 37 | parser.add_argument( 38 | "fairseq_path", type=str, help="bart.large, bart.large.cnn or a path to a model.pt on local filesystem." 39 | ) 40 | parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") 41 | parser.add_argument( 42 | "--hf_config", 43 | default="facebook/mbart-large-cc25", 44 | type=str, 45 | help="Which huggingface architecture to use: bart-large-xsum", 46 | ) 47 | args = parser.parse_args() 48 | model = convert_fairseq_mbart_checkpoint_from_disk(args.fairseq_path, hf_config_path=args.hf_config) 49 | model.save_pretrained(args.pytorch_dump_folder_path) 50 | -------------------------------------------------------------------------------- /transformers/models/longformer/tokenization_longformer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | from ...utils import logging 17 | from ..roberta.tokenization_roberta import RobertaTokenizer 18 | 19 | 20 | logger = logging.get_logger(__name__) 21 | 22 | 23 | # vocab and merges same as roberta 24 | vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json" 25 | merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt" 26 | _all_longformer_models = [ 27 | "allenai/longformer-base-4096", 28 | "allenai/longformer-large-4096", 29 | "allenai/longformer-large-4096-finetuned-triviaqa", 30 | "allenai/longformer-base-4096-extra.pos.embd.only", 31 | "allenai/longformer-large-4096-extra.pos.embd.only", 32 | ] 33 | 34 | 35 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 36 | "allenai/longformer-base-4096": 4096, 37 | "allenai/longformer-large-4096": 4096, 38 | "allenai/longformer-large-4096-finetuned-triviaqa": 4096, 39 | "allenai/longformer-base-4096-extra.pos.embd.only": 4096, 40 | "allenai/longformer-large-4096-extra.pos.embd.only": 4096, 41 | } 42 | 43 | 44 | class LongformerTokenizer(RobertaTokenizer): 45 | r""" 46 | Construct a Longformer tokenizer. 47 | 48 | :class:`~transformers.LongformerTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to the 49 | superclass for usage examples and documentation concerning parameters. 50 | """ 51 | # merges and vocab same as Roberta 52 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 53 | pretrained_vocab_files_map = { 54 | "vocab_file": {m: vocab_url for m in _all_longformer_models}, 55 | "merges_file": {m: merges_url for m in _all_longformer_models}, 56 | } 57 | -------------------------------------------------------------------------------- /transformers/models/t5/convert_t5_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The T5 authors and HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert T5 checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | from transformers import T5Config, T5ForConditionalGeneration, load_tf_weights_in_t5 21 | from transformers.utils import logging 22 | 23 | 24 | logging.set_verbosity_info() 25 | 26 | 27 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): 28 | # Initialise PyTorch model 29 | config = T5Config.from_json_file(config_file) 30 | print("Building PyTorch model from configuration: {}".format(str(config))) 31 | model = T5ForConditionalGeneration(config) 32 | 33 | # Load weights from tf checkpoint 34 | load_tf_weights_in_t5(model, config, tf_checkpoint_path) 35 | 36 | # Save pytorch-model 37 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 38 | model.save_pretrained(pytorch_dump_path) 39 | 40 | 41 | if __name__ == "__main__": 42 | parser = argparse.ArgumentParser() 43 | # Required parameters 44 | parser.add_argument( 45 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 46 | ) 47 | parser.add_argument( 48 | "--config_file", 49 | default=None, 50 | type=str, 51 | required=True, 52 | help="The config json file corresponding to the pre-trained T5 model. \n" 53 | "This specifies the model architecture.", 54 | ) 55 | parser.add_argument( 56 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 57 | ) 58 | args = parser.parse_args() 59 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) 60 | -------------------------------------------------------------------------------- /transformers/utils/model_parallel_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from math import ceil 17 | 18 | 19 | def assert_device_map(device_map, num_blocks): 20 | blocks = list(range(0, num_blocks)) 21 | 22 | device_map_blocks = [item for sublist in list(device_map.values()) for item in sublist] 23 | 24 | # Duplicate check 25 | duplicate_blocks = [] 26 | for i in device_map_blocks: 27 | if device_map_blocks.count(i) > 1 and i not in duplicate_blocks: 28 | duplicate_blocks.append(i) 29 | # Missing blocks 30 | missing_blocks = [i for i in blocks if i not in device_map_blocks] 31 | extra_blocks = [i for i in device_map_blocks if i not in blocks] 32 | 33 | assert len(duplicate_blocks) == 0, ( 34 | "Duplicate attention blocks specified in device_map. Attention blocks must be specified to one device. These " 35 | "attention blocks were specified more than once: " + str(duplicate_blocks) 36 | ) 37 | assert len(missing_blocks) == 0, ( 38 | "There are attention blocks for this model that are not specified in the device_map. 
Add these attention " 39 | "blocks to a device on the device_map: " + str(missing_blocks) 40 | ) 41 | assert ( 42 | len(extra_blocks) == 0 43 | ), "The device_map contains more attention blocks than this model has. Remove these from the device_map:" + str( 44 | extra_blocks 45 | ) 46 | 47 | 48 | def get_device_map(n_layers, devices): 49 | """Returns a dictionary of layers distributed evenly across all devices.""" 50 | layers = list(range(n_layers)) 51 | n_blocks = int(ceil(n_layers / len(devices))) 52 | layers_list = list(layers[i : i + n_blocks] for i in range(0, n_layers, n_blocks)) 53 | 54 | return dict(zip(devices, layers_list)) 55 | -------------------------------------------------------------------------------- /transformers/models/lxmert/convert_lxmert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert LXMERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import torch 21 | 22 | from transformers import LxmertConfig, LxmertForPreTraining, load_tf_weights_in_lxmert 23 | from transformers.utils import logging 24 | 25 | 26 | logging.set_verbosity_info() 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = LxmertConfig.from_json_file(config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = LxmertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_lxmert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import torch 21 | 22 | from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert 23 | from transformers.utils import logging 24 | 25 | 26 | logging.set_verbosity_info() 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = BertConfig.from_json_file(bert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = BertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_bert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--bert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained BERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert ALBERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import torch 21 | 22 | from transformers import AlbertConfig, AlbertForPreTraining, load_tf_weights_in_albert 23 | from transformers.utils import logging 24 | 25 | 26 | logging.set_verbosity_info() 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = AlbertConfig.from_json_file(albert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = AlbertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_albert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--albert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained ALBERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /transformers/models/mobilebert/convert_mobilebert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import argparse 16 | 17 | import torch 18 | 19 | from transformers import MobileBertConfig, MobileBertForPreTraining, load_tf_weights_in_mobilebert 20 | from transformers.utils import logging 21 | 22 | 23 | logging.set_verbosity_info() 24 | 25 | 26 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, mobilebert_config_file, pytorch_dump_path): 27 | # Initialise PyTorch model 28 | config = MobileBertConfig.from_json_file(mobilebert_config_file) 29 | print("Building PyTorch model from configuration: {}".format(str(config))) 30 | model = MobileBertForPreTraining(config) 31 | # Load weights from tf checkpoint 32 | model = load_tf_weights_in_mobilebert(model, config, tf_checkpoint_path) 33 | # Save pytorch-model 34 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 35 | torch.save(model.state_dict(), pytorch_dump_path) 36 | 37 | 38 | if __name__ == "__main__": 39 | parser = argparse.ArgumentParser() 40 | # Required parameters 41 | parser.add_argument( 42 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 43 | ) 44 | parser.add_argument( 45 | "--mobilebert_config_file", 46 | default=None, 47 | type=str, 48 | required=True, 49 | help="The config json file corresponding to the pre-trained MobileBERT model. \n" 50 | "This specifies the model architecture.", 51 | ) 52 | parser.add_argument( 53 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 54 | ) 55 | args = parser.parse_args() 56 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.mobilebert_config_file, args.pytorch_dump_path) 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /transformers/models/retribert/tokenization_retribert_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for RetriBERT.""" 16 | 17 | from ...utils import logging 18 | from ..bert.tokenization_bert_fast import BertTokenizerFast 19 | from .tokenization_retribert import RetriBertTokenizer 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} 25 | 26 | PRETRAINED_VOCAB_FILES_MAP = { 27 | "vocab_file": { 28 | "yjernite/retribert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", 29 | }, 30 | "tokenizer_file": { 31 | "yjernite/retribert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", 32 | }, 33 | } 34 | 35 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 36 | "yjernite/retribert-base-uncased": 512, 37 | } 38 | 39 | 40 | PRETRAINED_INIT_CONFIGURATION = { 41 | "yjernite/retribert-base-uncased": {"do_lower_case": True}, 42 | } 43 | 44 | 45 | class RetriBertTokenizerFast(BertTokenizerFast): 46 | r""" 47 | Construct a "fast" RetriBERT tokenizer (backed by HuggingFace's `tokenizers` library). 48 | 49 | :class:`~transformers.RetriBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs 50 | end-to-end tokenization: punctuation splitting and wordpiece. 51 | 52 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning 53 | parameters. 
54 | """ 55 | 56 | vocab_files_names = VOCAB_FILES_NAMES 57 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 58 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 59 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 60 | slow_tokenizer_class = RetriBertTokenizer 61 | model_input_names = ["attention_mask"] 62 | -------------------------------------------------------------------------------- /transformers/models/tapas/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_tapas": ["TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP", "TapasConfig"], 26 | "tokenization_tapas": ["TapasTokenizer"], 27 | } 28 | 29 | if is_torch_available(): 30 | _import_structure["modeling_tapas"] = [ 31 | "TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST", 32 | "TapasForMaskedLM", 33 | "TapasForQuestionAnswering", 34 | "TapasForSequenceClassification", 35 | "TapasModel", 36 | ] 37 | 38 | 39 | if TYPE_CHECKING: 40 | from .configuration_tapas import TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP, TapasConfig 41 | from .tokenization_tapas import TapasTokenizer 42 | 43 | if is_torch_available(): 44 | from .modeling_tapas import ( 45 | TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST, 46 | TapasForMaskedLM, 47 | TapasForQuestionAnswering, 48 | TapasForSequenceClassification, 49 | TapasModel, 50 | ) 51 | 52 | else: 53 | import importlib 54 | import os 55 | import sys 56 | 57 | class _LazyModule(_BaseLazyModule): 58 | """ 59 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 60 | """ 61 | 62 | __file__ = globals()["__file__"] 63 | __path__ = [os.path.dirname(__file__)] 64 | 65 | def _get_module(self, module_name: str): 66 | return importlib.import_module("." + module_name, self.__name__) 67 | 68 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 69 | -------------------------------------------------------------------------------- /transformers/models/bert_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_sentencepiece_available, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_bert_generation": ["BertGenerationConfig"], 26 | } 27 | 28 | if is_sentencepiece_available(): 29 | _import_structure["tokenization_bert_generation"] = ["BertGenerationTokenizer"] 30 | 31 | if is_torch_available(): 32 | _import_structure["modeling_bert_generation"] = [ 33 | "BertGenerationDecoder", 34 | "BertGenerationEncoder", 35 | "load_tf_weights_in_bert_generation", 36 | ] 37 | 38 | 39 | if TYPE_CHECKING: 40 | from .configuration_bert_generation import BertGenerationConfig 41 | 42 | if is_sentencepiece_available(): 43 | from .tokenization_bert_generation import BertGenerationTokenizer 44 | 45 | if is_torch_available(): 46 | from .modeling_bert_generation import ( 47 | BertGenerationDecoder, 48 | BertGenerationEncoder, 49 | load_tf_weights_in_bert_generation, 50 | ) 51 | 52 | else: 53 | import importlib 54 | import os 55 | import sys 56 | 57 | class _LazyModule(_BaseLazyModule): 58 | """ 59 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 60 | """ 61 | 62 | __file__ = globals()["__file__"] 63 | __path__ = [os.path.dirname(__file__)] 64 | 65 | def _get_module(self, module_name: str): 66 | return importlib.import_module("." + module_name, self.__name__) 67 | 68 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 69 | -------------------------------------------------------------------------------- /transformers/models/squeezebert/tokenization_squeezebert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The SqueezeBert authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for SqueezeBERT.""" 16 | 17 | from ...utils import logging 18 | from ..bert.tokenization_bert import BertTokenizer 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 24 | 25 | PRETRAINED_VOCAB_FILES_MAP = { 26 | "vocab_file": { 27 | "squeezebert/squeezebert-uncased": "https://huggingface.co/squeezebert/squeezebert-uncased/resolve/main/vocab.txt", 28 | "squeezebert/squeezebert-mnli": "https://huggingface.co/squeezebert/squeezebert-mnli/resolve/main/vocab.txt", 29 | "squeezebert/squeezebert-mnli-headless": "https://huggingface.co/squeezebert/squeezebert-mnli-headless/resolve/main/vocab.txt", 30 | } 31 | } 32 | 33 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 34 | "squeezebert/squeezebert-uncased": 512, 35 | "squeezebert/squeezebert-mnli": 512, 36 | "squeezebert/squeezebert-mnli-headless": 512, 37 | } 38 | 39 | 40 | PRETRAINED_INIT_CONFIGURATION = { 41 | "squeezebert/squeezebert-uncased": {"do_lower_case": True}, 42 | "squeezebert/squeezebert-mnli": {"do_lower_case": True}, 43 | "squeezebert/squeezebert-mnli-headless": {"do_lower_case": True}, 44 | } 45 | 46 | 47 | class SqueezeBertTokenizer(BertTokenizer): 48 | r""" 49 | Constructs a SqueezeBert tokenizer. 50 | 51 | :class:`~transformers.SqueezeBertTokenizer is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 52 | tokenization: punctuation splitting + wordpiece. 53 | 54 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 55 | parameters. 56 | """ 57 | 58 | vocab_files_names = VOCAB_FILES_NAMES 59 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 60 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 61 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 62 | -------------------------------------------------------------------------------- /transformers/models/funnel/convert_funnel_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert Funnel checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import torch 21 | 22 | from transformers import FunnelBaseModel, FunnelConfig, FunnelModel, load_tf_weights_in_funnel 23 | from transformers.utils import logging 24 | 25 | 26 | logging.set_verbosity_info() 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, base_model): 30 | # Initialise PyTorch model 31 | config = FunnelConfig.from_json_file(config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = FunnelBaseModel(config) if base_model else FunnelModel(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_funnel(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | parser.add_argument( 61 | "--base_model", action="store_true", help="Whether you want just the base model (no decoder) or not." 62 | ) 63 | args = parser.parse_args() 64 | convert_tf_checkpoint_to_pytorch( 65 | args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.base_model 66 | ) 67 | -------------------------------------------------------------------------------- /transformers/models/retribert/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_tokenizers_available, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_retribert": ["RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RetriBertConfig"], 26 | "tokenization_retribert": ["RetriBertTokenizer"], 27 | } 28 | 29 | if is_tokenizers_available(): 30 | _import_structure["tokenization_retribert_fast"] = ["RetriBertTokenizerFast"] 31 | 32 | if is_torch_available(): 33 | _import_structure["modeling_retribert"] = [ 34 | "RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST", 35 | "RetriBertModel", 36 | "RetriBertPreTrainedModel", 37 | ] 38 | 39 | 40 | if TYPE_CHECKING: 41 | from .configuration_retribert import RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RetriBertConfig 42 | from .tokenization_retribert import RetriBertTokenizer 43 | 44 | if is_tokenizers_available(): 45 | from .tokenization_retribert_fast import RetriBertTokenizerFast 46 | 47 | if is_torch_available(): 48 | from .modeling_retribert import ( 49 | RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST, 50 | RetriBertModel, 51 | RetriBertPreTrainedModel, 52 | ) 53 | 54 | else: 55 | import importlib 56 | import os 57 | import sys 58 | 59 | class _LazyModule(_BaseLazyModule): 60 | """ 61 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 62 | """ 63 | 64 | __file__ = globals()["__file__"] 65 | __path__ = [os.path.dirname(__file__)] 66 | 67 | def _get_module(self, module_name: str): 68 | return importlib.import_module("." + module_name, self.__name__) 69 | 70 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 71 | -------------------------------------------------------------------------------- /transformers/models/deberta/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_deberta": ["DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaConfig"], 26 | "tokenization_deberta": ["DebertaTokenizer"], 27 | } 28 | 29 | if is_torch_available(): 30 | _import_structure["modeling_deberta"] = [ 31 | "DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST", 32 | "DebertaForSequenceClassification", 33 | "DebertaModel", 34 | "DebertaForMaskedLM", 35 | "DebertaPreTrainedModel", 36 | "DebertaForTokenClassification", 37 | "DebertaForQuestionAnswering", 38 | ] 39 | 40 | 41 | if TYPE_CHECKING: 42 | from .configuration_deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig 43 | from .tokenization_deberta import DebertaTokenizer 44 | 45 | if is_torch_available(): 46 | from .modeling_deberta import ( 47 | DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, 48 | DebertaForMaskedLM, 49 | DebertaForQuestionAnswering, 50 | DebertaForSequenceClassification, 51 | DebertaForTokenClassification, 52 | DebertaModel, 53 | DebertaPreTrainedModel, 54 | ) 55 | 56 | else: 57 | import importlib 58 | import os 59 | import sys 60 | 61 | class _LazyModule(_BaseLazyModule): 62 | """ 63 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 64 | """ 65 | 66 | __file__ = globals()["__file__"] 67 | __path__ = [os.path.dirname(__file__)] 68 | 69 | def _get_module(self, module_name: str): 70 | return importlib.import_module("." + module_name, self.__name__) 71 | 72 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 73 | -------------------------------------------------------------------------------- /transformers/models/longformer/tokenization_longformer_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | from ...utils import logging 17 | from ..roberta.tokenization_roberta_fast import RobertaTokenizerFast 18 | from .tokenization_longformer import LongformerTokenizer 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | 24 | # vocab and merges same as roberta 25 | vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json" 26 | merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt" 27 | tokenizer_url = "https://huggingface.co/roberta-large/resolve/main/tokenizer.json" 28 | _all_longformer_models = [ 29 | "allenai/longformer-base-4096", 30 | "allenai/longformer-large-4096", 31 | "allenai/longformer-large-4096-finetuned-triviaqa", 32 | "allenai/longformer-base-4096-extra.pos.embd.only", 33 | "allenai/longformer-large-4096-extra.pos.embd.only", 34 | ] 35 | 36 | 37 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 38 | "allenai/longformer-base-4096": 4096, 39 | "allenai/longformer-large-4096": 4096, 40 | "allenai/longformer-large-4096-finetuned-triviaqa": 4096, 41 | "allenai/longformer-base-4096-extra.pos.embd.only": 4096, 42 | "allenai/longformer-large-4096-extra.pos.embd.only": 4096, 43 | } 44 | 45 | 46 | class LongformerTokenizerFast(RobertaTokenizerFast): 47 | r""" 48 | Construct a "fast" Longformer tokenizer (backed by HuggingFace's `tokenizers` library). 49 | 50 | :class:`~transformers.LongformerTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer 51 | to the superclass for usage examples and documentation concerning parameters. 52 | """ 53 | # merges and vocab same as Roberta 54 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 55 | pretrained_vocab_files_map = { 56 | "vocab_file": {m: vocab_url for m in _all_longformer_models}, 57 | "merges_file": {m: merges_url for m in _all_longformer_models}, 58 | "tokenizer_file": {m: tokenizer_url for m in _all_longformer_models}, 59 | } 60 | slow_tokenizer_class = LongformerTokenizer 61 | -------------------------------------------------------------------------------- /transformers/models/prophetnet/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_prophetnet": ["PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "ProphetNetConfig"], 26 | "tokenization_prophetnet": ["ProphetNetTokenizer"], 27 | } 28 | 29 | if is_torch_available(): 30 | _import_structure["modeling_prophetnet"] = [ 31 | "PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST", 32 | "ProphetNetDecoder", 33 | "ProphetNetEncoder", 34 | "ProphetNetForCausalLM", 35 | "ProphetNetForConditionalGeneration", 36 | "ProphetNetModel", 37 | "ProphetNetPreTrainedModel", 38 | ] 39 | 40 | 41 | if TYPE_CHECKING: 42 | from .configuration_prophetnet import PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, ProphetNetConfig 43 | from .tokenization_prophetnet import ProphetNetTokenizer 44 | 45 | if is_torch_available(): 46 | from .modeling_prophetnet import ( 47 | PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST, 48 | ProphetNetDecoder, 49 | ProphetNetEncoder, 50 | ProphetNetForCausalLM, 51 | ProphetNetForConditionalGeneration, 52 | ProphetNetModel, 53 | ProphetNetPreTrainedModel, 54 | ) 55 | 56 | else: 57 | import importlib 58 | import os 59 | import sys 60 | 61 | class _LazyModule(_BaseLazyModule): 62 | """ 63 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 64 | """ 65 | 66 | __file__ = globals()["__file__"] 67 | __path__ = [os.path.dirname(__file__)] 68 | 69 | def _get_module(self, module_name: str): 70 | return importlib.import_module("." + module_name, self.__name__) 71 | 72 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 73 | -------------------------------------------------------------------------------- /transformers/models/layoutlm/tokenization_layoutlm_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """ Tokenization class for model LayoutLM.""" 16 | 17 | 18 | from ...utils import logging 19 | from ..bert.tokenization_bert_fast import BertTokenizerFast 20 | from .tokenization_layoutlm import LayoutLMTokenizer 21 | 22 | 23 | logger = logging.get_logger(__name__) 24 | 25 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} 26 | 27 | PRETRAINED_VOCAB_FILES_MAP = { 28 | "vocab_file": { 29 | "microsoft/layoutlm-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", 30 | "microsoft/layoutlm-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt", 31 | }, 32 | "tokenizer_file": { 33 | "microsoft/layoutlm-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", 34 | "microsoft/layoutlm-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/tokenizer.json", 35 | }, 36 | } 37 | 38 | 39 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 40 | "microsoft/layoutlm-base-uncased": 512, 41 | "microsoft/layoutlm-large-uncased": 512, 42 | } 43 | 44 | 45 | PRETRAINED_INIT_CONFIGURATION = { 46 | "microsoft/layoutlm-base-uncased": {"do_lower_case": True}, 47 | "microsoft/layoutlm-large-uncased": {"do_lower_case": True}, 48 | } 49 | 50 | 51 | class LayoutLMTokenizerFast(BertTokenizerFast): 52 | r""" 53 | Constructs a "Fast" LayoutLMTokenizer. 54 | 55 | :class:`~transformers.LayoutLMTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs 56 | end-to-end tokenization: punctuation splitting + wordpiece. 57 | 58 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning 59 | parameters. 60 | """ 61 | 62 | vocab_files_names = VOCAB_FILES_NAMES 63 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 64 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 65 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 66 | slow_tokenizer_class = LayoutLMTokenizer 67 | -------------------------------------------------------------------------------- /transformers/models/blenderbot/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_tf_available, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_blenderbot": ["BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BlenderbotConfig"], 26 | "tokenization_blenderbot": ["BlenderbotTokenizer"], 27 | } 28 | 29 | if is_torch_available(): 30 | _import_structure["modeling_blenderbot"] = [ 31 | "BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST", 32 | "BlenderbotForConditionalGeneration", 33 | "BlenderbotModel", 34 | "BlenderbotPreTrainedModel", 35 | ] 36 | 37 | 38 | if is_tf_available(): 39 | _import_structure["modeling_tf_blenderbot"] = ["TFBlenderbotForConditionalGeneration", "TFBlenderbotModel"] 40 | 41 | 42 | if TYPE_CHECKING: 43 | from .configuration_blenderbot import BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotConfig 44 | from .tokenization_blenderbot import BlenderbotTokenizer 45 | 46 | if is_torch_available(): 47 | from .modeling_blenderbot import ( 48 | BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST, 49 | BlenderbotForConditionalGeneration, 50 | BlenderbotModel, 51 | BlenderbotPreTrainedModel, 52 | ) 53 | 54 | if is_tf_available(): 55 | from .modeling_tf_blenderbot import TFBlenderbotForConditionalGeneration, TFBlenderbotModel 56 | 57 | else: 58 | import importlib 59 | import os 60 | import sys 61 | 62 | class _LazyModule(_BaseLazyModule): 63 | """ 64 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 65 | """ 66 | 67 | __file__ = globals()["__file__"] 68 | __path__ = [os.path.dirname(__file__)] 69 | 70 | def _get_module(self, module_name: str): 71 | return importlib.import_module("." + module_name, self.__name__) 72 | 73 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 74 | -------------------------------------------------------------------------------- /transformers/models/layoutlm/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_tokenizers_available, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_layoutlm": ["LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "LayoutLMConfig"], 26 | "tokenization_layoutlm": ["LayoutLMTokenizer"], 27 | } 28 | 29 | if is_tokenizers_available(): 30 | _import_structure["tokenization_layoutlm_fast"] = ["LayoutLMTokenizerFast"] 31 | 32 | if is_torch_available(): 33 | _import_structure["modeling_layoutlm"] = [ 34 | "LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST", 35 | "LayoutLMForMaskedLM", 36 | "LayoutLMForSequenceClassification", 37 | "LayoutLMForTokenClassification", 38 | "LayoutLMModel", 39 | ] 40 | 41 | 42 | if TYPE_CHECKING: 43 | from .configuration_layoutlm import LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP, LayoutLMConfig 44 | from .tokenization_layoutlm import LayoutLMTokenizer 45 | 46 | if is_tokenizers_available(): 47 | from .tokenization_layoutlm_fast import LayoutLMTokenizerFast 48 | 49 | if is_torch_available(): 50 | from .modeling_layoutlm import ( 51 | LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST, 52 | LayoutLMForMaskedLM, 53 | LayoutLMForSequenceClassification, 54 | LayoutLMForTokenClassification, 55 | LayoutLMModel, 56 | ) 57 | 58 | else: 59 | import importlib 60 | import os 61 | import sys 62 | 63 | class _LazyModule(_BaseLazyModule): 64 | """ 65 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 66 | """ 67 | 68 | __file__ = globals()["__file__"] 69 | __path__ = [os.path.dirname(__file__)] 70 | 71 | def _get_module(self, module_name: str): 72 | return importlib.import_module("." + module_name, self.__name__) 73 | 74 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 75 | -------------------------------------------------------------------------------- /transformers/models/lxmert/tokenization_lxmert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from ..bert.tokenization_bert import BertTokenizer 17 | 18 | 19 | #################################################### 20 | # Mapping from the keyword arguments names of Tokenizer `__init__` 21 | # to file names for serializing Tokenizer instances 22 | #################################################### 23 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 24 | 25 | #################################################### 26 | # Mapping from the keyword arguments names of Tokenizer `__init__` 27 | # to pretrained vocabulary URL for all the model ids. 
28 | #################################################### 29 | PRETRAINED_VOCAB_FILES_MAP = { 30 | "vocab_file": { 31 | "unc-nlp/lxmert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", 32 | } 33 | } 34 | 35 | #################################################### 36 | # Mapping from model ids to max length of inputs 37 | #################################################### 38 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 39 | "unc-nlp/lxmert-base-uncased": 512, 40 | } 41 | #################################################### 42 | # Mapping from model ids to a dictionary of additional 43 | # keyword arguments for Tokenizer `__init__`. 44 | # To be used for checkpoint specific configurations. 45 | #################################################### 46 | PRETRAINED_INIT_CONFIGURATION = { 47 | "unc-nlp/lxmert-base-uncased": {"do_lower_case": True}, 48 | } 49 | 50 | 51 | class LxmertTokenizer(BertTokenizer): 52 | r""" 53 | Construct an LXMERT tokenizer. 54 | 55 | :class:`~transformers.LxmertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 56 | tokenization: punctuation splitting and wordpiece. 57 | 58 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 59 | parameters. 60 | """ 61 | 62 | vocab_files_names = VOCAB_FILES_NAMES 63 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 64 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 65 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 66 | -------------------------------------------------------------------------------- /transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import torch 21 | 22 | from transformers import GPT2Config, GPT2Model, load_tf_weights_in_gpt2 23 | from transformers.file_utils import CONFIG_NAME, WEIGHTS_NAME 24 | from transformers.utils import logging 25 | 26 | 27 | logging.set_verbosity_info() 28 | 29 | 30 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 31 | # Construct model 32 | if gpt2_config_file == "": 33 | config = GPT2Config() 34 | else: 35 | config = GPT2Config.from_json_file(gpt2_config_file) 36 | model = GPT2Model(config) 37 | 38 | # Load weights from numpy 39 | load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) 40 | 41 | # Save pytorch-model 42 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 43 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 44 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 45 | torch.save(model.state_dict(), pytorch_weights_dump_path) 46 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 47 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 48 | f.write(config.to_json_string()) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | # Required parameters 54 | parser.add_argument( 55 | "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | parser.add_argument( 61 | "--gpt2_config_file", 62 | default="", 63 | type=str, 64 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n" 65 | "This specifies the model architecture.", 66 | ) 67 | args = parser.parse_args() 68 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path) 69 | -------------------------------------------------------------------------------- /transformers/models/marian/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | from typing import TYPE_CHECKING 19 | 20 | from ...file_utils import ( 21 | _BaseLazyModule, 22 | is_sentencepiece_available, 23 | is_tf_available, 24 | is_tokenizers_available, 25 | is_torch_available, 26 | ) 27 | 28 | 29 | _import_structure = { 30 | "configuration_marian": ["MARIAN_PRETRAINED_CONFIG_ARCHIVE_MAP", "MarianConfig"], 31 | } 32 | 33 | if is_sentencepiece_available(): 34 | _import_structure["tokenization_marian"] = ["MarianTokenizer"] 35 | 36 | if is_torch_available(): 37 | _import_structure["modeling_marian"] = [ 38 | "MARIAN_PRETRAINED_MODEL_ARCHIVE_LIST", 39 | "MarianModel", 40 | "MarianMTModel", 41 | "MarianPreTrainedModel", 42 | ] 43 | 44 | if is_tf_available(): 45 | _import_structure["modeling_tf_marian"] = ["TFMarianMTModel", "TFMarianModel"] 46 | 47 | 48 | if TYPE_CHECKING: 49 | from .configuration_marian import MARIAN_PRETRAINED_CONFIG_ARCHIVE_MAP, MarianConfig 50 | 51 | if is_sentencepiece_available(): 52 | from .tokenization_marian import MarianTokenizer 53 | 54 | if is_torch_available(): 55 | from .modeling_marian import ( 56 | MARIAN_PRETRAINED_MODEL_ARCHIVE_LIST, 57 | MarianModel, 58 | MarianMTModel, 59 | MarianPreTrainedModel, 60 | ) 61 | 62 | if is_tf_available(): 63 | from .modeling_tf_marian import TFMarianModel, TFMarianMTModel 64 | 65 | else: 66 | import importlib 67 | import os 68 | import sys 69 | 70 | class _LazyModule(_BaseLazyModule): 71 | """ 72 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 73 | """ 74 | 75 | __file__ = globals()["__file__"] 76 | __path__ = [os.path.dirname(__file__)] 77 | 78 | def _get_module(self, module_name: str): 79 | return importlib.import_module("." + module_name, self.__name__) 80 | 81 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 82 | -------------------------------------------------------------------------------- /transformers/commands/env.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import platform 16 | from argparse import ArgumentParser 17 | 18 | from .. import __version__ as version 19 | from ..file_utils import is_tf_available, is_torch_available 20 | from . 
import BaseTransformersCLICommand 21 | 22 | 23 | def info_command_factory(_): 24 | return EnvironmentCommand() 25 | 26 | 27 | class EnvironmentCommand(BaseTransformersCLICommand): 28 | @staticmethod 29 | def register_subcommand(parser: ArgumentParser): 30 | download_parser = parser.add_parser("env") 31 | download_parser.set_defaults(func=info_command_factory) 32 | 33 | def run(self): 34 | pt_version = "not installed" 35 | pt_cuda_available = "NA" 36 | if is_torch_available(): 37 | import torch 38 | 39 | pt_version = torch.__version__ 40 | pt_cuda_available = torch.cuda.is_available() 41 | 42 | tf_version = "not installed" 43 | tf_cuda_available = "NA" 44 | if is_tf_available(): 45 | import tensorflow as tf 46 | 47 | tf_version = tf.__version__ 48 | try: 49 | # deprecated in v2.1 50 | tf_cuda_available = tf.test.is_gpu_available() 51 | except AttributeError: 52 | # returns list of devices, convert to bool 53 | tf_cuda_available = bool(tf.config.list_physical_devices("GPU")) 54 | 55 | info = { 56 | "`transformers` version": version, 57 | "Platform": platform.platform(), 58 | "Python version": platform.python_version(), 59 | "PyTorch version (GPU?)": "{} ({})".format(pt_version, pt_cuda_available), 60 | "Tensorflow version (GPU?)": "{} ({})".format(tf_version, tf_cuda_available), 61 | "Using GPU in script?": "", 62 | "Using distributed or parallel set-up in script?": "", 63 | } 64 | 65 | print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n") 66 | print(self.format_dict(info)) 67 | 68 | return info 69 | 70 | @staticmethod 71 | def format_dict(d): 72 | return "\n".join(["- {}: {}".format(prop, val) for prop, val in d.items()]) + "\n" 73 | -------------------------------------------------------------------------------- /transformers/models/blenderbot_small/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
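EnvironmentCommand is registered as the `env` subcommand of transformers-cli, but because run() just collects and returns a plain dict, it can also be exercised directly from Python; a small sketch:

from transformers.commands.env import EnvironmentCommand

info = EnvironmentCommand().run()  # prints the report and returns the underlying dict
print(info["Platform"])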
18 | from typing import TYPE_CHECKING 19 | 20 | from ...file_utils import _BaseLazyModule, is_tf_available, is_torch_available 21 | 22 | 23 | _import_structure = { 24 | "configuration_blenderbot_small": ["BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP", "BlenderbotSmallConfig"], 25 | "tokenization_blenderbot_small": ["BlenderbotSmallTokenizer"], 26 | } 27 | 28 | if is_torch_available(): 29 | _import_structure["modeling_blenderbot_small"] = [ 30 | "BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST", 31 | "BlenderbotSmallForConditionalGeneration", 32 | "BlenderbotSmallModel", 33 | "BlenderbotSmallPreTrainedModel", 34 | ] 35 | 36 | if is_tf_available(): 37 | _import_structure["modeling_tf_blenderbot_small"] = [ 38 | "TFBlenderbotSmallForConditionalGeneration", 39 | "TFBlenderbotSmallModel", 40 | ] 41 | 42 | if TYPE_CHECKING: 43 | from .configuration_blenderbot_small import BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotSmallConfig 44 | from .tokenization_blenderbot_small import BlenderbotSmallTokenizer 45 | 46 | if is_torch_available(): 47 | from .modeling_blenderbot_small import ( 48 | BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST, 49 | BlenderbotSmallForConditionalGeneration, 50 | BlenderbotSmallModel, 51 | BlenderbotSmallPreTrainedModel, 52 | ) 53 | 54 | if is_tf_available(): 55 | from .modeling_tf_blenderbot_small import TFBlenderbotSmallForConditionalGeneration, TFBlenderbotSmallModel 56 | 57 | else: 58 | import importlib 59 | import os 60 | import sys 61 | 62 | class _LazyModule(_BaseLazyModule): 63 | """ 64 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 65 | """ 66 | 67 | __file__ = globals()["__file__"] 68 | __path__ = [os.path.dirname(__file__)] 69 | 70 | def _get_module(self, module_name: str): 71 | return importlib.import_module("." + module_name, self.__name__) 72 | 73 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 74 | -------------------------------------------------------------------------------- /transformers/models/openai/convert_openai_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import torch 21 | 22 | from transformers import OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt 23 | from transformers.file_utils import CONFIG_NAME, WEIGHTS_NAME 24 | from transformers.utils import logging 25 | 26 | 27 | logging.set_verbosity_info() 28 | 29 | 30 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 31 | # Construct model 32 | if openai_config_file == "": 33 | config = OpenAIGPTConfig() 34 | else: 35 | config = OpenAIGPTConfig.from_json_file(openai_config_file) 36 | model = OpenAIGPTModel(config) 37 | 38 | # Load weights from numpy 39 | load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path) 40 | 41 | # Save pytorch-model 42 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 43 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 44 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 45 | torch.save(model.state_dict(), pytorch_weights_dump_path) 46 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 47 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 48 | f.write(config.to_json_string()) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | # Required parameters 54 | parser.add_argument( 55 | "--openai_checkpoint_folder_path", 56 | default=None, 57 | type=str, 58 | required=True, 59 | help="Path to the TensorFlow checkpoint path.", 60 | ) 61 | parser.add_argument( 62 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 63 | ) 64 | parser.add_argument( 65 | "--openai_config_file", 66 | default="", 67 | type=str, 68 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n" 69 | "This specifies the model architecture.", 70 | ) 71 | args = parser.parse_args() 72 | convert_openai_checkpoint_to_pytorch( 73 | args.openai_checkpoint_folder_path, args.openai_config_file, args.pytorch_dump_folder_path 74 | ) 75 | -------------------------------------------------------------------------------- /transformers/models/ctrl/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_tf_available, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_ctrl": ["CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CTRLConfig"], 26 | "tokenization_ctrl": ["CTRLTokenizer"], 27 | } 28 | 29 | if is_torch_available(): 30 | _import_structure["modeling_ctrl"] = [ 31 | "CTRL_PRETRAINED_MODEL_ARCHIVE_LIST", 32 | "CTRLForSequenceClassification", 33 | "CTRLLMHeadModel", 34 | "CTRLModel", 35 | "CTRLPreTrainedModel", 36 | ] 37 | 38 | if is_tf_available(): 39 | _import_structure["modeling_tf_ctrl"] = [ 40 | "TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST", 41 | "TFCTRLForSequenceClassification", 42 | "TFCTRLLMHeadModel", 43 | "TFCTRLModel", 44 | "TFCTRLPreTrainedModel", 45 | ] 46 | 47 | 48 | if TYPE_CHECKING: 49 | from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig 50 | from .tokenization_ctrl import CTRLTokenizer 51 | 52 | if is_torch_available(): 53 | from .modeling_ctrl import ( 54 | CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, 55 | CTRLForSequenceClassification, 56 | CTRLLMHeadModel, 57 | CTRLModel, 58 | CTRLPreTrainedModel, 59 | ) 60 | 61 | if is_tf_available(): 62 | from .modeling_tf_ctrl import ( 63 | TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, 64 | TFCTRLForSequenceClassification, 65 | TFCTRLLMHeadModel, 66 | TFCTRLModel, 67 | TFCTRLPreTrainedModel, 68 | ) 69 | 70 | else: 71 | import importlib 72 | import os 73 | import sys 74 | 75 | class _LazyModule(_BaseLazyModule): 76 | """ 77 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 78 | """ 79 | 80 | __file__ = globals()["__file__"] 81 | __path__ = [os.path.dirname(__file__)] 82 | 83 | def _get_module(self, module_name: str): 84 | return importlib.import_module("." + module_name, self.__name__) 85 | 86 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 87 | -------------------------------------------------------------------------------- /transformers/models/led/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2021 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | from typing import TYPE_CHECKING 19 | 20 | from ...file_utils import _BaseLazyModule, is_tf_available, is_tokenizers_available, is_torch_available 21 | 22 | 23 | _import_structure = { 24 | "configuration_led": ["LED_PRETRAINED_CONFIG_ARCHIVE_MAP", "LEDConfig"], 25 | "tokenization_led": ["LEDTokenizer"], 26 | } 27 | 28 | if is_tokenizers_available(): 29 | _import_structure["tokenization_led_fast"] = ["LEDTokenizerFast"] 30 | 31 | if is_torch_available(): 32 | _import_structure["modeling_led"] = [ 33 | "LED_PRETRAINED_MODEL_ARCHIVE_LIST", 34 | "LEDForConditionalGeneration", 35 | "LEDForQuestionAnswering", 36 | "LEDForSequenceClassification", 37 | "LEDModel", 38 | "LEDPreTrainedModel", 39 | ] 40 | 41 | 42 | if is_tf_available(): 43 | _import_structure["modeling_tf_led"] = ["TFLEDForConditionalGeneration", "TFLEDModel", "TFLEDPreTrainedModel"] 44 | 45 | 46 | if TYPE_CHECKING: 47 | from .configuration_led import LED_PRETRAINED_CONFIG_ARCHIVE_MAP, LEDConfig 48 | from .tokenization_led import LEDTokenizer 49 | 50 | if is_tokenizers_available(): 51 | from .tokenization_led_fast import LEDTokenizerFast 52 | 53 | if is_torch_available(): 54 | from .modeling_led import ( 55 | LED_PRETRAINED_MODEL_ARCHIVE_LIST, 56 | LEDForConditionalGeneration, 57 | LEDForQuestionAnswering, 58 | LEDForSequenceClassification, 59 | LEDModel, 60 | LEDPreTrainedModel, 61 | ) 62 | 63 | if is_tf_available(): 64 | from .modeling_tf_led import TFLEDForConditionalGeneration, TFLEDModel, TFLEDPreTrainedModel 65 | 66 | else: 67 | import importlib 68 | import os 69 | import sys 70 | 71 | class _LazyModule(_BaseLazyModule): 72 | """ 73 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 74 | """ 75 | 76 | __file__ = globals()["__file__"] 77 | __path__ = [os.path.dirname(__file__)] 78 | 79 | def _get_module(self, module_name: str): 80 | return importlib.import_module("." + module_name, self.__name__) 81 | 82 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 83 | -------------------------------------------------------------------------------- /transformers/models/roberta/configuration_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ RoBERTa configuration """ 17 | 18 | from ...utils import logging 19 | from ..bert.configuration_bert import BertConfig 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 25 | "roberta-base": "https://huggingface.co/roberta-base/resolve/main/config.json", 26 | "roberta-large": "https://huggingface.co/roberta-large/resolve/main/config.json", 27 | "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/config.json", 28 | "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/config.json", 29 | "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/config.json", 30 | "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/config.json", 31 | } 32 | 33 | 34 | class RobertaConfig(BertConfig): 35 | r""" 36 | This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel` or a 37 | :class:`~transformers.TFRobertaModel`. It is used to instantiate a RoBERTa model according to the specified 38 | arguments, defining the model architecture. 39 | 40 | 41 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model 42 | outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. 43 | 44 | The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`. It reuses the 45 | same defaults. Please check the parent class for more information. 46 | 47 | Examples:: 48 | 49 | >>> from transformers import RobertaConfig, RobertaModel 50 | 51 | >>> # Initializing a RoBERTa configuration 52 | >>> configuration = RobertaConfig() 53 | 54 | >>> # Initializing a model from the configuration 55 | >>> model = RobertaModel(configuration) 56 | 57 | >>> # Accessing the model configuration 58 | >>> configuration = model.config 59 | """ 60 | model_type = "roberta" 61 | 62 | def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs): 63 | """Constructs RobertaConfig.""" 64 | super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) 65 | -------------------------------------------------------------------------------- /transformers/activations_tf.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import math 16 | 17 | import tensorflow as tf 18 | from packaging import version 19 | 20 | 21 | def _gelu(x): 22 | """ 23 | Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when 24 | initially created. 
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 25 | 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) Also see 26 | https://arxiv.org/abs/1606.08415 27 | """ 28 | x = tf.convert_to_tensor(x) 29 | cdf = 0.5 * (1.0 + tf.math.erf(x / tf.cast(tf.sqrt(2.0), x.dtype))) 30 | 31 | return x * cdf 32 | 33 | 34 | def _gelu_new(x): 35 | """ 36 | Gaussian Error Linear Unit. This is a smoother version of the RELU. Original paper: https://arxiv.org/abs/1606.08415 37 | 38 | Args: 39 | x: float Tensor to perform activation 40 | 41 | Returns: 42 | `x` with the GELU activation applied. 43 | """ 44 | x = tf.convert_to_tensor(x) 45 | pi = tf.cast(math.pi, x.dtype) 46 | coeff = tf.cast(0.044715, x.dtype) 47 | cdf = 0.5 * (1.0 + tf.tanh(tf.sqrt(2.0 / pi) * (x + coeff * tf.pow(x, 3)))) 48 | 49 | return x * cdf 50 | 51 | 52 | def mish(x): 53 | x = tf.convert_to_tensor(x) 54 | 55 | return x * tf.tanh(tf.math.softplus(x)) 56 | 57 | 58 | def gelu_fast(x): 59 | x = tf.convert_to_tensor(x) 60 | coeff1 = tf.cast(0.7978845608, x.dtype) 61 | coeff2 = tf.cast(0.044715, x.dtype) 62 | 63 | return 0.5 * x * (1.0 + tf.tanh(x * coeff1 * (1.0 + coeff2 * x * x))) 64 | 65 | 66 | if version.parse(tf.version.VERSION) >= version.parse("2.4"): 67 | 68 | def approximate_gelu_wrap(x): 69 | return tf.keras.activations.gelu(x, approximate=True) 70 | 71 | gelu = tf.keras.activations.gelu 72 | gelu_new = approximate_gelu_wrap 73 | else: 74 | gelu = _gelu 75 | gelu_new = _gelu_new 76 | 77 | 78 | ACT2FN = { 79 | "gelu": gelu, 80 | "relu": tf.keras.activations.relu, 81 | "swish": tf.keras.activations.swish, 82 | "silu": tf.keras.activations.swish, 83 | "gelu_new": gelu_new, 84 | "mish": mish, 85 | "tanh": tf.keras.activations.tanh, 86 | "gelu_fast": gelu_fast, 87 | } 88 | 89 | 90 | def get_tf_activation(activation_string): 91 | if activation_string in ACT2FN: 92 | return ACT2FN[activation_string] 93 | else: 94 | raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys()))) 95 | -------------------------------------------------------------------------------- /transformers/models/reformer/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License.
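As a quick numerical check, the exact gelu and the tanh-based gelu_new approximation above agree closely (differences on the order of 1e-4 for inputs of this scale); a sketch using the get_tf_activation lookup:

import tensorflow as tf

from transformers.activations_tf import get_tf_activation

x = tf.constant([-1.0, 0.0, 1.0])
exact = get_tf_activation("gelu")(x)
approx = get_tf_activation("gelu_new")(x)
print(tf.reduce_max(tf.abs(exact - approx)).numpy())  # small, roughly 1e-4 here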
18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_sentencepiece_available, is_tokenizers_available, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_reformer": ["REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "ReformerConfig"], 26 | } 27 | 28 | if is_sentencepiece_available(): 29 | _import_structure["tokenization_reformer"] = ["ReformerTokenizer"] 30 | 31 | if is_tokenizers_available(): 32 | _import_structure["tokenization_reformer_fast"] = ["ReformerTokenizerFast"] 33 | 34 | if is_torch_available(): 35 | _import_structure["modeling_reformer"] = [ 36 | "REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", 37 | "ReformerAttention", 38 | "ReformerForMaskedLM", 39 | "ReformerForQuestionAnswering", 40 | "ReformerForSequenceClassification", 41 | "ReformerLayer", 42 | "ReformerModel", 43 | "ReformerModelWithLMHead", 44 | ] 45 | 46 | 47 | if TYPE_CHECKING: 48 | from .configuration_reformer import REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, ReformerConfig 49 | 50 | if is_sentencepiece_available(): 51 | from .tokenization_reformer import ReformerTokenizer 52 | 53 | if is_tokenizers_available(): 54 | from .tokenization_reformer_fast import ReformerTokenizerFast 55 | 56 | if is_torch_available(): 57 | from .modeling_reformer import ( 58 | REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, 59 | ReformerAttention, 60 | ReformerForMaskedLM, 61 | ReformerForQuestionAnswering, 62 | ReformerForSequenceClassification, 63 | ReformerLayer, 64 | ReformerModel, 65 | ReformerModelWithLMHead, 66 | ) 67 | 68 | else: 69 | import importlib 70 | import os 71 | import sys 72 | 73 | class _LazyModule(_BaseLazyModule): 74 | """ 75 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 76 | """ 77 | 78 | __file__ = globals()["__file__"] 79 | __path__ = [os.path.dirname(__file__)] 80 | 81 | def _get_module(self, module_name: str): 82 | return importlib.import_module("." + module_name, self.__name__) 83 | 84 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 85 | -------------------------------------------------------------------------------- /transformers/models/electra/tokenization_electra.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | from ..bert.tokenization_bert import BertTokenizer 17 | 18 | 19 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 20 | 21 | PRETRAINED_VOCAB_FILES_MAP = { 22 | "vocab_file": { 23 | "google/electra-small-generator": "https://huggingface.co/google/electra-small-generator/resolve/main/vocab.txt", 24 | "google/electra-base-generator": "https://huggingface.co/google/electra-base-generator/resolve/main/vocab.txt", 25 | "google/electra-large-generator": "https://huggingface.co/google/electra-large-generator/resolve/main/vocab.txt", 26 | "google/electra-small-discriminator": "https://huggingface.co/google/electra-small-discriminator/resolve/main/vocab.txt", 27 | "google/electra-base-discriminator": "https://huggingface.co/google/electra-base-discriminator/resolve/main/vocab.txt", 28 | "google/electra-large-discriminator": "https://huggingface.co/google/electra-large-discriminator/resolve/main/vocab.txt", 29 | } 30 | } 31 | 32 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 33 | "google/electra-small-generator": 512, 34 | "google/electra-base-generator": 512, 35 | "google/electra-large-generator": 512, 36 | "google/electra-small-discriminator": 512, 37 | "google/electra-base-discriminator": 512, 38 | "google/electra-large-discriminator": 512, 39 | } 40 | 41 | 42 | PRETRAINED_INIT_CONFIGURATION = { 43 | "google/electra-small-generator": {"do_lower_case": True}, 44 | "google/electra-base-generator": {"do_lower_case": True}, 45 | "google/electra-large-generator": {"do_lower_case": True}, 46 | "google/electra-small-discriminator": {"do_lower_case": True}, 47 | "google/electra-base-discriminator": {"do_lower_case": True}, 48 | "google/electra-large-discriminator": {"do_lower_case": True}, 49 | } 50 | 51 | 52 | class ElectraTokenizer(BertTokenizer): 53 | r""" 54 | Construct an ELECTRA tokenizer. 55 | 56 | :class:`~transformers.ElectraTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 57 | tokenization: punctuation splitting and wordpiece. 58 | 59 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 60 | parameters. 61 | """ 62 | 63 | vocab_files_names = VOCAB_FILES_NAMES 64 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 65 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 66 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 67 | -------------------------------------------------------------------------------- /transformers/models/pegasus/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | from typing import TYPE_CHECKING 19 | 20 | from ...file_utils import ( 21 | _BaseLazyModule, 22 | is_sentencepiece_available, 23 | is_tf_available, 24 | is_tokenizers_available, 25 | is_torch_available, 26 | ) 27 | 28 | 29 | _import_structure = { 30 | "configuration_pegasus": ["PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP", "PegasusConfig"], 31 | } 32 | 33 | if is_sentencepiece_available(): 34 | _import_structure["tokenization_pegasus"] = ["PegasusTokenizer"] 35 | 36 | if is_tokenizers_available(): 37 | _import_structure["tokenization_pegasus_fast"] = ["PegasusTokenizerFast"] 38 | 39 | if is_torch_available(): 40 | _import_structure["modeling_pegasus"] = [ 41 | "PEGASUS_PRETRAINED_MODEL_ARCHIVE_LIST", 42 | "PegasusForConditionalGeneration", 43 | "PegasusModel", 44 | "PegasusPreTrainedModel", 45 | ] 46 | 47 | if is_tf_available(): 48 | _import_structure["modeling_tf_pegasus"] = ["TFPegasusForConditionalGeneration", "TFPegasusModel"] 49 | 50 | 51 | if TYPE_CHECKING: 52 | from .configuration_pegasus import PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, PegasusConfig 53 | 54 | if is_sentencepiece_available(): 55 | from .tokenization_pegasus import PegasusTokenizer 56 | 57 | if is_tokenizers_available(): 58 | from .tokenization_pegasus_fast import PegasusTokenizerFast 59 | 60 | if is_torch_available(): 61 | from .modeling_pegasus import ( 62 | PEGASUS_PRETRAINED_MODEL_ARCHIVE_LIST, 63 | PegasusForConditionalGeneration, 64 | PegasusModel, 65 | PegasusPreTrainedModel, 66 | ) 67 | 68 | if is_tf_available(): 69 | from .modeling_tf_pegasus import TFPegasusForConditionalGeneration, TFPegasusModel 70 | 71 | else: 72 | import importlib 73 | import os 74 | import sys 75 | 76 | class _LazyModule(_BaseLazyModule): 77 | """ 78 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 79 | """ 80 | 81 | __file__ = globals()["__file__"] 82 | __path__ = [os.path.dirname(__file__)] 83 | 84 | def _get_module(self, module_name: str): 85 | return importlib.import_module("." + module_name, self.__name__) 86 | 87 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 88 | -------------------------------------------------------------------------------- /transformers/models/bart/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | from typing import TYPE_CHECKING 19 | 20 | from ...file_utils import _BaseLazyModule, is_tf_available, is_tokenizers_available, is_torch_available 21 | 22 | 23 | _import_structure = { 24 | "configuration_bart": ["BART_PRETRAINED_CONFIG_ARCHIVE_MAP", "BartConfig"], 25 | "tokenization_bart": ["BartTokenizer"], 26 | } 27 | 28 | if is_tokenizers_available(): 29 | _import_structure["tokenization_bart_fast"] = ["BartTokenizerFast"] 30 | 31 | if is_torch_available(): 32 | _import_structure["modeling_bart"] = [ 33 | "BART_PRETRAINED_MODEL_ARCHIVE_LIST", 34 | "BartForConditionalGeneration", 35 | "BartForQuestionAnswering", 36 | "BartForSequenceClassification", 37 | "BartModel", 38 | "BartPretrainedModel", 39 | "PretrainedBartModel", 40 | ] 41 | 42 | if is_tf_available(): 43 | _import_structure["modeling_tf_bart"] = ["TFBartForConditionalGeneration", "TFBartModel", "TFBartPretrainedModel"] 44 | 45 | 46 | if TYPE_CHECKING: 47 | from .configuration_bart import BART_PRETRAINED_CONFIG_ARCHIVE_MAP, BartConfig 48 | from .tokenization_bart import BartTokenizer 49 | 50 | if is_tokenizers_available(): 51 | from .tokenization_bart_fast import BartTokenizerFast 52 | 53 | if is_torch_available(): 54 | from .modeling_bart import ( 55 | BART_PRETRAINED_MODEL_ARCHIVE_LIST, 56 | BartForConditionalGeneration, 57 | BartForQuestionAnswering, 58 | BartForSequenceClassification, 59 | BartModel, 60 | BartPretrainedModel, 61 | PretrainedBartModel, 62 | ) 63 | 64 | if is_tf_available(): 65 | from .modeling_tf_bart import TFBartForConditionalGeneration, TFBartModel, TFBartPretrainedModel 66 | 67 | else: 68 | import importlib 69 | import os 70 | import sys 71 | 72 | class _LazyModule(_BaseLazyModule): 73 | """ 74 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 75 | """ 76 | 77 | __file__ = globals()["__file__"] 78 | __path__ = [os.path.dirname(__file__)] 79 | 80 | def _get_module(self, module_name: str): 81 | return importlib.import_module("." + module_name, self.__name__) 82 | 83 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 84 | -------------------------------------------------------------------------------- /transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert ELECTRA checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import torch 21 | 22 | from transformers import ElectraConfig, ElectraForMaskedLM, ElectraForPreTraining, load_tf_weights_in_electra 23 | from transformers.utils import logging 24 | 25 | 26 | logging.set_verbosity_info() 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator): 30 | # Initialise PyTorch model 31 | config = ElectraConfig.from_json_file(config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | 34 | if discriminator_or_generator == "discriminator": 35 | model = ElectraForPreTraining(config) 36 | elif discriminator_or_generator == "generator": 37 | model = ElectraForMaskedLM(config) 38 | else: 39 | raise ValueError("The discriminator_or_generator argument should be either 'discriminator' or 'generator'") 40 | 41 | # Load weights from tf checkpoint 42 | load_tf_weights_in_electra( 43 | model, config, tf_checkpoint_path, discriminator_or_generator=discriminator_or_generator 44 | ) 45 | 46 | # Save pytorch-model 47 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 48 | torch.save(model.state_dict(), pytorch_dump_path) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | # Required parameters 54 | parser.add_argument( 55 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 56 | ) 57 | parser.add_argument( 58 | "--config_file", 59 | default=None, 60 | type=str, 61 | required=True, 62 | help="The config json file corresponding to the pre-trained model. \n" 63 | "This specifies the model architecture.", 64 | ) 65 | parser.add_argument( 66 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 67 | ) 68 | parser.add_argument( 69 | "--discriminator_or_generator", 70 | default=None, 71 | type=str, 72 | required=True, 73 | help="Whether to export the generator or the discriminator. Should be a string, either 'discriminator' or " 74 | "'generator'.", 75 | ) 76 | args = parser.parse_args() 77 | convert_tf_checkpoint_to_pytorch( 78 | args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.discriminator_or_generator 79 | ) 80 | -------------------------------------------------------------------------------- /transformers/models/lxmert/tokenization_lxmert_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | from ..bert.tokenization_bert_fast import BertTokenizerFast 17 | from .tokenization_lxmert import LxmertTokenizer 18 | 19 | 20 | #################################################### 21 | # Mapping from the keyword arguments names of Tokenizer `__init__` 22 | # to file names for serializing Tokenizer instances 23 | #################################################### 24 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} 25 | 26 | #################################################### 27 | # Mapping from the keyword arguments names of Tokenizer `__init__` 28 | # to pretrained vocabulary URL for all the model ids. 29 | #################################################### 30 | PRETRAINED_VOCAB_FILES_MAP = { 31 | "vocab_file": { 32 | "unc-nlp/lxmert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", 33 | }, 34 | "tokenizer_file": { 35 | "unc-nlp/lxmert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", 36 | }, 37 | } 38 | 39 | #################################################### 40 | # Mapping from model ids to max length of inputs 41 | #################################################### 42 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 43 | "unc-nlp/lxmert-base-uncased": 512, 44 | } 45 | #################################################### 46 | # Mapping from model ids to a dictionary of additional 47 | # keyword arguments for Tokenizer `__init__`. 48 | # To be used for checkpoint specific configurations. 49 | #################################################### 50 | PRETRAINED_INIT_CONFIGURATION = { 51 | "unc-nlp/lxmert-base-uncased": {"do_lower_case": True}, 52 | } 53 | 54 | 55 | class LxmertTokenizerFast(BertTokenizerFast): 56 | r""" 57 | Construct a "fast" LXMERT tokenizer (backed by HuggingFace's `tokenizers` library). 58 | 59 | :class:`~transformers.LxmertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs 60 | end-to-end tokenization: punctuation splitting and wordpiece. 61 | 62 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning 63 | parameters. 64 | """ 65 | vocab_files_names = VOCAB_FILES_NAMES 66 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 67 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 68 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 69 | slow_tokenizer_class = LxmertTokenizer 70 | -------------------------------------------------------------------------------- /transformers/models/herbert/tokenization_herbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google AI Language Team Authors, Allegro.pl, Facebook Inc. and the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 
16 | from ...utils import logging
17 | from ..bert.tokenization_bert import BasicTokenizer
18 | from ..xlm.tokenization_xlm import XLMTokenizer
19 | 
20 | 
21 | logger = logging.get_logger(__name__)
22 | 
23 | VOCAB_FILES_NAMES = {
24 |     "vocab_file": "vocab.json",
25 |     "merges_file": "merges.txt",
26 | }
27 | 
28 | PRETRAINED_VOCAB_FILES_MAP = {
29 |     "vocab_file": {"allegro/herbert-base-cased": "https://cdn.huggingface.co/allegro/herbert-base-cased/vocab.json"},
30 |     "merges_file": {"allegro/herbert-base-cased": "https://cdn.huggingface.co/allegro/herbert-base-cased/merges.txt"},
31 | }
32 | 
33 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"allegro/herbert-base-cased": 514}
34 | PRETRAINED_INIT_CONFIGURATION = {}
35 | 
36 | 
37 | class HerbertTokenizer(XLMTokenizer):
38 |     """
39 |     Construct a BPE tokenizer for HerBERT.
40 | 
41 |     Peculiarities:
42 | 
43 |     - uses BERT's pre-tokenizer: BasicTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of a
44 |       punctuation character will be treated separately.
45 | 
46 |     - Such pretokenized input is BPE subtokenized
47 | 
48 |     This tokenizer inherits from :class:`~transformers.XLMTokenizer` which contains most of the methods. Users should
49 |     refer to the superclass for more information regarding methods.
50 |     """
51 | 
52 |     vocab_files_names = VOCAB_FILES_NAMES
53 |     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
54 |     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
55 |     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
56 | 
57 |     def __init__(self, **kwargs):
58 | 
59 |         kwargs["cls_token"] = "<s>"
60 |         kwargs["unk_token"] = "<unk>"
61 |         kwargs["pad_token"] = "<pad>"
62 |         kwargs["mask_token"] = "<mask>"
63 |         kwargs["sep_token"] = "</s>"
64 |         kwargs["do_lowercase_and_remove_accent"] = False
65 |         kwargs["additional_special_tokens"] = []
66 | 
67 |         super().__init__(**kwargs)
68 |         self.bert_pre_tokenizer = BasicTokenizer(
69 |             do_lower_case=False, never_split=self.all_special_tokens, tokenize_chinese_chars=False, strip_accents=False
70 |         )
71 | 
72 |     def _tokenize(self, text):
73 | 
74 |         pre_tokens = self.bert_pre_tokenizer.tokenize(text)
75 | 
76 |         split_tokens = []
77 |         for token in pre_tokens:
78 |             if token:
79 |                 split_tokens.extend([t for t in self.bpe(token).split(" ")])
80 | 
81 |         return split_tokens
82 | 
--------------------------------------------------------------------------------
/transformers/models/distilbert/tokenization_distilbert.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes for DistilBERT.""" 16 | 17 | from ...utils import logging 18 | from ..bert.tokenization_bert import BertTokenizer 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 24 | 25 | PRETRAINED_VOCAB_FILES_MAP = { 26 | "vocab_file": { 27 | "distilbert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", 28 | "distilbert-base-uncased-distilled-squad": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt", 29 | "distilbert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt", 30 | "distilbert-base-cased-distilled-squad": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt", 31 | "distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/vocab.txt", 32 | "distilbert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt", 33 | } 34 | } 35 | 36 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 37 | "distilbert-base-uncased": 512, 38 | "distilbert-base-uncased-distilled-squad": 512, 39 | "distilbert-base-cased": 512, 40 | "distilbert-base-cased-distilled-squad": 512, 41 | "distilbert-base-german-cased": 512, 42 | "distilbert-base-multilingual-cased": 512, 43 | } 44 | 45 | 46 | PRETRAINED_INIT_CONFIGURATION = { 47 | "distilbert-base-uncased": {"do_lower_case": True}, 48 | "distilbert-base-uncased-distilled-squad": {"do_lower_case": True}, 49 | "distilbert-base-cased": {"do_lower_case": False}, 50 | "distilbert-base-cased-distilled-squad": {"do_lower_case": False}, 51 | "distilbert-base-german-cased": {"do_lower_case": False}, 52 | "distilbert-base-multilingual-cased": {"do_lower_case": False}, 53 | } 54 | 55 | 56 | class DistilBertTokenizer(BertTokenizer): 57 | r""" 58 | Construct a DistilBERT tokenizer. 59 | 60 | :class:`~transformers.DistilBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 61 | tokenization: punctuation splitting and wordpiece. 62 | 63 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 64 | parameters. 65 | """ 66 | 67 | vocab_files_names = VOCAB_FILES_NAMES 68 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 69 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 70 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 71 | model_input_names = ["attention_mask"] 72 | -------------------------------------------------------------------------------- /transformers/models/squeezebert/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_tokenizers_available, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_squeezebert": ["SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "SqueezeBertConfig"], 26 | "tokenization_squeezebert": ["SqueezeBertTokenizer"], 27 | } 28 | 29 | if is_tokenizers_available(): 30 | _import_structure["tokenization_squeezebert_fast"] = ["SqueezeBertTokenizerFast"] 31 | 32 | if is_torch_available(): 33 | _import_structure["modeling_squeezebert"] = [ 34 | "SQUEEZEBERT_PRETRAINED_MODEL_ARCHIVE_LIST", 35 | "SqueezeBertForMaskedLM", 36 | "SqueezeBertForMultipleChoice", 37 | "SqueezeBertForQuestionAnswering", 38 | "SqueezeBertForSequenceClassification", 39 | "SqueezeBertForTokenClassification", 40 | "SqueezeBertModel", 41 | "SqueezeBertModule", 42 | "SqueezeBertPreTrainedModel", 43 | ] 44 | 45 | 46 | if TYPE_CHECKING: 47 | from .configuration_squeezebert import SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, SqueezeBertConfig 48 | from .tokenization_squeezebert import SqueezeBertTokenizer 49 | 50 | if is_tokenizers_available(): 51 | from .tokenization_squeezebert_fast import SqueezeBertTokenizerFast 52 | 53 | if is_torch_available(): 54 | from .modeling_squeezebert import ( 55 | SQUEEZEBERT_PRETRAINED_MODEL_ARCHIVE_LIST, 56 | SqueezeBertForMaskedLM, 57 | SqueezeBertForMultipleChoice, 58 | SqueezeBertForQuestionAnswering, 59 | SqueezeBertForSequenceClassification, 60 | SqueezeBertForTokenClassification, 61 | SqueezeBertModel, 62 | SqueezeBertModule, 63 | SqueezeBertPreTrainedModel, 64 | ) 65 | 66 | else: 67 | import importlib 68 | import os 69 | import sys 70 | 71 | class _LazyModule(_BaseLazyModule): 72 | """ 73 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 74 | """ 75 | 76 | __file__ = globals()["__file__"] 77 | __path__ = [os.path.dirname(__file__)] 78 | 79 | def _get_module(self, module_name: str): 80 | return importlib.import_module("." + module_name, self.__name__) 81 | 82 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 83 | -------------------------------------------------------------------------------- /transformers/models/squeezebert/tokenization_squeezebert_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The SqueezeBert authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for SqueezeBERT.""" 16 | 17 | from ...utils import logging 18 | from ..bert.tokenization_bert_fast import BertTokenizerFast 19 | from .tokenization_squeezebert import SqueezeBertTokenizer 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} 25 | 26 | PRETRAINED_VOCAB_FILES_MAP = { 27 | "vocab_file": { 28 | "squeezebert/squeezebert-uncased": "https://huggingface.co/squeezebert/squeezebert-uncased/resolve/main/vocab.txt", 29 | "squeezebert/squeezebert-mnli": "https://huggingface.co/squeezebert/squeezebert-mnli/resolve/main/vocab.txt", 30 | "squeezebert/squeezebert-mnli-headless": "https://huggingface.co/squeezebert/squeezebert-mnli-headless/resolve/main/vocab.txt", 31 | }, 32 | "tokenizer_file": { 33 | "squeezebert/squeezebert-uncased": "https://huggingface.co/squeezebert/squeezebert-uncased/resolve/main/tokenizer.json", 34 | "squeezebert/squeezebert-mnli": "https://huggingface.co/squeezebert/squeezebert-mnli/resolve/main/tokenizer.json", 35 | "squeezebert/squeezebert-mnli-headless": "https://huggingface.co/squeezebert/squeezebert-mnli-headless/resolve/main/tokenizer.json", 36 | }, 37 | } 38 | 39 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 40 | "squeezebert/squeezebert-uncased": 512, 41 | "squeezebert/squeezebert-mnli": 512, 42 | "squeezebert/squeezebert-mnli-headless": 512, 43 | } 44 | 45 | 46 | PRETRAINED_INIT_CONFIGURATION = { 47 | "squeezebert/squeezebert-uncased": {"do_lower_case": True}, 48 | "squeezebert/squeezebert-mnli": {"do_lower_case": True}, 49 | "squeezebert/squeezebert-mnli-headless": {"do_lower_case": True}, 50 | } 51 | 52 | 53 | class SqueezeBertTokenizerFast(BertTokenizerFast): 54 | r""" 55 | Constructs a "Fast" SqueezeBert tokenizer (backed by HuggingFace's `tokenizers` library). 56 | 57 | :class:`~transformers.SqueezeBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs 58 | end-to-end tokenization: punctuation splitting + wordpiece. 59 | 60 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning 61 | parameters. 62 | """ 63 | 64 | vocab_files_names = VOCAB_FILES_NAMES 65 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 66 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 67 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 68 | slow_tokenizer_class = SqueezeBertTokenizer 69 | -------------------------------------------------------------------------------- /transformers/convert_tf_hub_seq_to_seq_bert_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert Seq2Seq TF Hub checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | from . 
import ( 21 | BertConfig, 22 | BertGenerationConfig, 23 | BertGenerationDecoder, 24 | BertGenerationEncoder, 25 | load_tf_weights_in_bert_generation, 26 | logging, 27 | ) 28 | 29 | 30 | logging.set_verbosity_info() 31 | 32 | 33 | def convert_tf_checkpoint_to_pytorch(tf_hub_path, pytorch_dump_path, is_encoder_named_decoder, vocab_size, is_encoder): 34 | # Initialise PyTorch model 35 | bert_config = BertConfig.from_pretrained( 36 | "bert-large-cased", 37 | vocab_size=vocab_size, 38 | max_position_embeddings=512, 39 | is_decoder=True, 40 | add_cross_attention=True, 41 | ) 42 | bert_config_dict = bert_config.to_dict() 43 | del bert_config_dict["type_vocab_size"] 44 | config = BertGenerationConfig(**bert_config_dict) 45 | if is_encoder: 46 | model = BertGenerationEncoder(config) 47 | else: 48 | model = BertGenerationDecoder(config) 49 | print("Building PyTorch model from configuration: {}".format(str(config))) 50 | 51 | # Load weights from tf checkpoint 52 | load_tf_weights_in_bert_generation( 53 | model, 54 | tf_hub_path, 55 | model_class="bert", 56 | is_encoder_named_decoder=is_encoder_named_decoder, 57 | is_encoder=is_encoder, 58 | ) 59 | 60 | # Save pytorch-model 61 | print("Save PyTorch model and config to {}".format(pytorch_dump_path)) 62 | model.save_pretrained(pytorch_dump_path) 63 | 64 | 65 | if __name__ == "__main__": 66 | parser = argparse.ArgumentParser() 67 | # Required parameters 68 | parser.add_argument( 69 | "--tf_hub_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 70 | ) 71 | parser.add_argument( 72 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 73 | ) 74 | parser.add_argument( 75 | "--is_encoder_named_decoder", 76 | action="store_true", 77 | help="If decoder has to be renamed to encoder in PyTorch model.", 78 | ) 79 | parser.add_argument("--is_encoder", action="store_true", help="If model is an encoder.") 80 | parser.add_argument("--vocab_size", default=50358, type=int, help="Vocab size of model") 81 | args = parser.parse_args() 82 | convert_tf_checkpoint_to_pytorch( 83 | args.tf_hub_path, 84 | args.pytorch_dump_path, 85 | args.is_encoder_named_decoder, 86 | args.vocab_size, 87 | is_encoder=args.is_encoder, 88 | ) 89 | -------------------------------------------------------------------------------- /transformers/models/mt5/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import ( 22 | _BaseLazyModule, 23 | is_sentencepiece_available, 24 | is_tf_available, 25 | is_tokenizers_available, 26 | is_torch_available, 27 | ) 28 | 29 | 30 | if is_sentencepiece_available(): 31 | from ..t5.tokenization_t5 import T5Tokenizer 32 | 33 | MT5Tokenizer = T5Tokenizer 34 | 35 | if is_tokenizers_available(): 36 | from ..t5.tokenization_t5_fast import T5TokenizerFast 37 | 38 | MT5TokenizerFast = T5TokenizerFast 39 | 40 | _import_structure = { 41 | "configuration_mt5": ["MT5Config"], 42 | } 43 | 44 | if is_torch_available(): 45 | _import_structure["modeling_mt5"] = ["MT5EncoderModel", "MT5ForConditionalGeneration", "MT5Model"] 46 | 47 | if is_tf_available(): 48 | _import_structure["modeling_tf_mt5"] = ["TFMT5EncoderModel", "TFMT5ForConditionalGeneration", "TFMT5Model"] 49 | 50 | 51 | if TYPE_CHECKING: 52 | from .configuration_mt5 import MT5Config 53 | 54 | if is_sentencepiece_available(): 55 | from ..t5.tokenization_t5 import T5Tokenizer 56 | 57 | MT5Tokenizer = T5Tokenizer 58 | 59 | if is_tokenizers_available(): 60 | from ..t5.tokenization_t5_fast import T5TokenizerFast 61 | 62 | MT5TokenizerFast = T5TokenizerFast 63 | 64 | if is_torch_available(): 65 | from .modeling_mt5 import MT5EncoderModel, MT5ForConditionalGeneration, MT5Model 66 | 67 | if is_tf_available(): 68 | from .modeling_tf_mt5 import TFMT5EncoderModel, TFMT5ForConditionalGeneration, TFMT5Model 69 | 70 | else: 71 | import importlib 72 | import os 73 | import sys 74 | 75 | class _LazyModule(_BaseLazyModule): 76 | """ 77 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 78 | """ 79 | 80 | __file__ = globals()["__file__"] 81 | __path__ = [os.path.dirname(__file__)] 82 | 83 | def _get_module(self, module_name: str): 84 | return importlib.import_module("." + module_name, self.__name__) 85 | 86 | def __getattr__(self, name): 87 | if name == "MT5Tokenizer": 88 | return MT5Tokenizer 89 | elif name == name == "MT5TokenizerFast": 90 | return MT5TokenizerFast 91 | else: 92 | return super().__getattr__(name) 93 | 94 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 95 | -------------------------------------------------------------------------------- /transformers/models/mbart/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | from typing import TYPE_CHECKING 19 | 20 | from ...file_utils import ( 21 | _BaseLazyModule, 22 | is_sentencepiece_available, 23 | is_tf_available, 24 | is_tokenizers_available, 25 | is_torch_available, 26 | ) 27 | 28 | 29 | _import_structure = { 30 | "configuration_mbart": ["MBART_PRETRAINED_CONFIG_ARCHIVE_MAP", "MBartConfig"], 31 | } 32 | 33 | if is_sentencepiece_available(): 34 | _import_structure["tokenization_mbart"] = ["MBartTokenizer"] 35 | 36 | if is_tokenizers_available(): 37 | _import_structure["tokenization_mbart_fast"] = ["MBartTokenizerFast"] 38 | 39 | if is_torch_available(): 40 | _import_structure["modeling_mbart"] = [ 41 | "MBART_PRETRAINED_MODEL_ARCHIVE_LIST", 42 | "MBartForConditionalGeneration", 43 | "MBartForQuestionAnswering", 44 | "MBartForSequenceClassification", 45 | "MBartModel", 46 | "MBartPreTrainedModel", 47 | ] 48 | 49 | if is_tf_available(): 50 | _import_structure["modeling_tf_mbart"] = ["TFMBartForConditionalGeneration", "TFMBartModel"] 51 | 52 | 53 | if TYPE_CHECKING: 54 | from .configuration_mbart import MBART_PRETRAINED_CONFIG_ARCHIVE_MAP, MBartConfig 55 | 56 | if is_sentencepiece_available(): 57 | from .tokenization_mbart import MBartTokenizer 58 | 59 | if is_tokenizers_available(): 60 | from .tokenization_mbart_fast import MBartTokenizerFast 61 | 62 | if is_torch_available(): 63 | from .modeling_mbart import ( 64 | MBART_PRETRAINED_MODEL_ARCHIVE_LIST, 65 | MBartForConditionalGeneration, 66 | MBartForQuestionAnswering, 67 | MBartForSequenceClassification, 68 | MBartModel, 69 | MBartPreTrainedModel, 70 | ) 71 | 72 | if is_tf_available(): 73 | from .modeling_tf_mbart import TFMBartForConditionalGeneration, TFMBartModel 74 | 75 | else: 76 | import importlib 77 | import os 78 | import sys 79 | 80 | class _LazyModule(_BaseLazyModule): 81 | """ 82 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 83 | """ 84 | 85 | __file__ = globals()["__file__"] 86 | __path__ = [os.path.dirname(__file__)] 87 | 88 | def _get_module(self, module_name: str): 89 | return importlib.import_module("." + module_name, self.__name__) 90 | 91 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 92 | -------------------------------------------------------------------------------- /transformers/models/xlm/convert_xlm_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import json 20 | 21 | import numpy 22 | import torch 23 | 24 | from transformers.file_utils import CONFIG_NAME, WEIGHTS_NAME 25 | from transformers.models.xlm.tokenization_xlm import VOCAB_FILES_NAMES 26 | from transformers.utils import logging 27 | 28 | 29 | logging.set_verbosity_info() 30 | 31 | 32 | def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path): 33 | # Load checkpoint 34 | chkpt = torch.load(xlm_checkpoint_path, map_location="cpu") 35 | 36 | state_dict = chkpt["model"] 37 | 38 | # We have the base model one level deeper than the original XLM repository 39 | two_levels_state_dict = {} 40 | for k, v in state_dict.items(): 41 | if "pred_layer" in k: 42 | two_levels_state_dict[k] = v 43 | else: 44 | two_levels_state_dict["transformer." + k] = v 45 | 46 | config = chkpt["params"] 47 | config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))) 48 | 49 | vocab = chkpt["dico_word2id"] 50 | vocab = dict((s + "" if s.find("@@") == -1 and i > 13 else s.replace("@@", ""), i) for s, i in vocab.items()) 51 | 52 | # Save pytorch-model 53 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 54 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 55 | pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["vocab_file"] 56 | 57 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 58 | torch.save(two_levels_state_dict, pytorch_weights_dump_path) 59 | 60 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 61 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 62 | f.write(json.dumps(config, indent=2) + "\n") 63 | 64 | print("Save vocab file to {}".format(pytorch_config_dump_path)) 65 | with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f: 66 | f.write(json.dumps(vocab, indent=2) + "\n") 67 | 68 | 69 | if __name__ == "__main__": 70 | parser = argparse.ArgumentParser() 71 | # Required parameters 72 | parser.add_argument( 73 | "--xlm_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." 74 | ) 75 | parser.add_argument( 76 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 77 | ) 78 | args = parser.parse_args() 79 | convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path) 80 | -------------------------------------------------------------------------------- /transformers/models/openai/tokenization_openai_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Fast Tokenization classes for OpenAI GPT.""" 16 | 17 | 18 | from typing import Optional, Tuple 19 | 20 | from ...tokenization_utils_fast import PreTrainedTokenizerFast 21 | from ...utils import logging 22 | from .tokenization_openai import OpenAIGPTTokenizer 23 | 24 | 25 | logger = logging.get_logger(__name__) 26 | 27 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} 28 | 29 | PRETRAINED_VOCAB_FILES_MAP = { 30 | "vocab_file": {"openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/vocab.json"}, 31 | "merges_file": {"openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/merges.txt"}, 32 | "tokenizer_file": {"openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/tokenizer.json"}, 33 | } 34 | 35 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 36 | "openai-gpt": 512, 37 | } 38 | 39 | 40 | class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast): 41 | """ 42 | Construct a "fast" GPT Tokenizer (backed by HuggingFace's `tokenizers` library). Based on Byte-Pair-Encoding with 43 | the following peculiarities: 44 | 45 | - lower case all inputs 46 | - uses BERT's BasicTokenizer for pre-BPE tokenization 47 | 48 | This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main 49 | methods. Users should refer to this superclass for more information regarding those methods. 50 | 51 | Args: 52 | vocab_file (:obj:`str`): 53 | Path to the vocabulary file. 54 | merges_file (:obj:`str`): 55 | Path to the merges file. 56 | unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): 57 | The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this 58 | token instead. 59 | """ 60 | 61 | vocab_files_names = VOCAB_FILES_NAMES 62 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 63 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 64 | model_input_names = ["attention_mask"] 65 | slow_tokenizer_class = OpenAIGPTTokenizer 66 | 67 | def __init__(self, vocab_file, merges_file, tokenizer_file=None, unk_token="", **kwargs): 68 | super().__init__(vocab_file, merges_file, tokenizer_file=tokenizer_file, unk_token=unk_token, **kwargs) 69 | 70 | @property 71 | def do_lower_case(self): 72 | return True 73 | 74 | def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: 75 | files = self._tokenizer.model.save(save_directory, name=filename_prefix) 76 | return tuple(files) 77 | -------------------------------------------------------------------------------- /transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert RoBERTa checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import pytorch_lightning as pl 21 | import torch 22 | 23 | from transformers import LongformerForQuestionAnswering, LongformerModel 24 | 25 | 26 | class LightningModel(pl.LightningModule): 27 | def __init__(self, model): 28 | super().__init__() 29 | self.model = model 30 | self.num_labels = 2 31 | self.qa_outputs = torch.nn.Linear(self.model.config.hidden_size, self.num_labels) 32 | 33 | # implement only because lightning requires to do so 34 | def forward(self): 35 | pass 36 | 37 | 38 | def convert_longformer_qa_checkpoint_to_pytorch( 39 | longformer_model: str, longformer_question_answering_ckpt_path: str, pytorch_dump_folder_path: str 40 | ): 41 | 42 | # load longformer model from model identifier 43 | longformer = LongformerModel.from_pretrained(longformer_model) 44 | lightning_model = LightningModel(longformer) 45 | 46 | ckpt = torch.load(longformer_question_answering_ckpt_path, map_location=torch.device("cpu")) 47 | lightning_model.load_state_dict(ckpt["state_dict"]) 48 | 49 | # init longformer question answering model 50 | longformer_for_qa = LongformerForQuestionAnswering.from_pretrained(longformer_model) 51 | 52 | # transfer weights 53 | longformer_for_qa.longformer.load_state_dict(lightning_model.model.state_dict()) 54 | longformer_for_qa.qa_outputs.load_state_dict(lightning_model.qa_outputs.state_dict()) 55 | longformer_for_qa.eval() 56 | 57 | # save model 58 | longformer_for_qa.save_pretrained(pytorch_dump_folder_path) 59 | 60 | print("Conversion successful. Model saved under {}".format(pytorch_dump_folder_path)) 61 | 62 | 63 | if __name__ == "__main__": 64 | parser = argparse.ArgumentParser() 65 | # Required parameters 66 | parser.add_argument( 67 | "--longformer_model", 68 | default=None, 69 | type=str, 70 | required=True, 71 | help="model identifier of longformer. Should be either `longformer-base-4096` or `longformer-large-4096`.", 72 | ) 73 | parser.add_argument( 74 | "--longformer_question_answering_ckpt_path", 75 | default=None, 76 | type=str, 77 | required=True, 78 | help="Path the official PyTorch Lightning Checkpoint.", 79 | ) 80 | parser.add_argument( 81 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 82 | ) 83 | args = parser.parse_args() 84 | convert_longformer_qa_checkpoint_to_pytorch( 85 | args.longformer_model, args.longformer_question_answering_ckpt_path, args.pytorch_dump_folder_path 86 | ) 87 | -------------------------------------------------------------------------------- /transformers/models/rag/tokenization_rag.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020, The RAG Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for RAG.""" 16 | import os 17 | from typing import List, Optional 18 | 19 | from ...tokenization_utils_base import BatchEncoding 20 | from ...utils import logging 21 | from .configuration_rag import RagConfig 22 | 23 | 24 | logger = logging.get_logger(__name__) 25 | 26 | 27 | class RagTokenizer: 28 | def __init__(self, question_encoder, generator): 29 | self.question_encoder = question_encoder 30 | self.generator = generator 31 | 32 | def save_pretrained(self, save_directory): 33 | if os.path.isfile(save_directory): 34 | raise ValueError("Provided path ({}) should be a directory, not a file".format(save_directory)) 35 | os.makedirs(save_directory, exist_ok=True) 36 | question_encoder_path = os.path.join(save_directory, "question_encoder_tokenizer") 37 | generator_path = os.path.join(save_directory, "generator_tokenizer") 38 | self.question_encoder.save_pretrained(question_encoder_path) 39 | self.generator.save_pretrained(generator_path) 40 | 41 | @classmethod 42 | def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): 43 | # dynamically import AutoTokenizer 44 | from ..auto.tokenization_auto import AutoTokenizer 45 | 46 | config = kwargs.pop("config", None) 47 | 48 | if config is None: 49 | config = RagConfig.from_pretrained(pretrained_model_name_or_path) 50 | 51 | question_encoder = AutoTokenizer.from_pretrained( 52 | pretrained_model_name_or_path, config=config.question_encoder, subfolder="question_encoder_tokenizer" 53 | ) 54 | generator = AutoTokenizer.from_pretrained( 55 | pretrained_model_name_or_path, config=config.generator, subfolder="generator_tokenizer" 56 | ) 57 | return cls(question_encoder=question_encoder, generator=generator) 58 | 59 | def __call__(self, *args, **kwargs): 60 | return self.question_encoder(*args, **kwargs) 61 | 62 | def batch_decode(self, *args, **kwargs): 63 | return self.generator.batch_decode(*args, **kwargs) 64 | 65 | def prepare_seq2seq_batch( 66 | self, 67 | src_texts: List[str], 68 | tgt_texts: Optional[List[str]] = None, 69 | max_length: Optional[int] = None, 70 | max_target_length: Optional[int] = None, 71 | **kwargs, 72 | ) -> BatchEncoding: 73 | if max_length is None: 74 | max_length = self.question_encoder.model_max_length 75 | if max_target_length is None: 76 | max_target_length = self.generator.model_max_length 77 | return super().prepare_seq2seq_batch( 78 | src_texts, tgt_texts, max_length=max_length, max_target_length=max_target_length, **kwargs 79 | ) 80 | -------------------------------------------------------------------------------- /transformers/utils/dummy_sentencepiece_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 
2 | from ..file_utils import requires_sentencepiece 3 | 4 | 5 | class AlbertTokenizer: 6 | def __init__(self, *args, **kwargs): 7 | requires_sentencepiece(self) 8 | 9 | @classmethod 10 | def from_pretrained(self, *args, **kwargs): 11 | requires_sentencepiece(self) 12 | 13 | 14 | class BarthezTokenizer: 15 | def __init__(self, *args, **kwargs): 16 | requires_sentencepiece(self) 17 | 18 | @classmethod 19 | def from_pretrained(self, *args, **kwargs): 20 | requires_sentencepiece(self) 21 | 22 | 23 | class BertGenerationTokenizer: 24 | def __init__(self, *args, **kwargs): 25 | requires_sentencepiece(self) 26 | 27 | @classmethod 28 | def from_pretrained(self, *args, **kwargs): 29 | requires_sentencepiece(self) 30 | 31 | 32 | class CamembertTokenizer: 33 | def __init__(self, *args, **kwargs): 34 | requires_sentencepiece(self) 35 | 36 | @classmethod 37 | def from_pretrained(self, *args, **kwargs): 38 | requires_sentencepiece(self) 39 | 40 | 41 | class MarianTokenizer: 42 | def __init__(self, *args, **kwargs): 43 | requires_sentencepiece(self) 44 | 45 | @classmethod 46 | def from_pretrained(self, *args, **kwargs): 47 | requires_sentencepiece(self) 48 | 49 | 50 | class MBartTokenizer: 51 | def __init__(self, *args, **kwargs): 52 | requires_sentencepiece(self) 53 | 54 | @classmethod 55 | def from_pretrained(self, *args, **kwargs): 56 | requires_sentencepiece(self) 57 | 58 | 59 | class MT5Tokenizer: 60 | def __init__(self, *args, **kwargs): 61 | requires_sentencepiece(self) 62 | 63 | @classmethod 64 | def from_pretrained(self, *args, **kwargs): 65 | requires_sentencepiece(self) 66 | 67 | 68 | class PegasusTokenizer: 69 | def __init__(self, *args, **kwargs): 70 | requires_sentencepiece(self) 71 | 72 | @classmethod 73 | def from_pretrained(self, *args, **kwargs): 74 | requires_sentencepiece(self) 75 | 76 | 77 | class ReformerTokenizer: 78 | def __init__(self, *args, **kwargs): 79 | requires_sentencepiece(self) 80 | 81 | @classmethod 82 | def from_pretrained(self, *args, **kwargs): 83 | requires_sentencepiece(self) 84 | 85 | 86 | class T5Tokenizer: 87 | def __init__(self, *args, **kwargs): 88 | requires_sentencepiece(self) 89 | 90 | @classmethod 91 | def from_pretrained(self, *args, **kwargs): 92 | requires_sentencepiece(self) 93 | 94 | 95 | class XLMProphetNetTokenizer: 96 | def __init__(self, *args, **kwargs): 97 | requires_sentencepiece(self) 98 | 99 | @classmethod 100 | def from_pretrained(self, *args, **kwargs): 101 | requires_sentencepiece(self) 102 | 103 | 104 | class XLMRobertaTokenizer: 105 | def __init__(self, *args, **kwargs): 106 | requires_sentencepiece(self) 107 | 108 | @classmethod 109 | def from_pretrained(self, *args, **kwargs): 110 | requires_sentencepiece(self) 111 | 112 | 113 | class XLNetTokenizer: 114 | def __init__(self, *args, **kwargs): 115 | requires_sentencepiece(self) 116 | 117 | @classmethod 118 | def from_pretrained(self, *args, **kwargs): 119 | requires_sentencepiece(self) 120 | -------------------------------------------------------------------------------- /transformers/activations.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import math 16 | 17 | import torch 18 | import torch.nn.functional as F 19 | from packaging import version 20 | 21 | from .utils import logging 22 | 23 | 24 | logger = logging.get_logger(__name__) 25 | 26 | 27 | def _gelu_python(x): 28 | """ 29 | Original Implementation of the GELU activation function in Google BERT repo when initially created. For 30 | information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + 31 | torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in 32 | torch.nn.functional Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 33 | """ 34 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 35 | 36 | 37 | def gelu_new(x): 38 | """ 39 | Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see 40 | the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 41 | """ 42 | return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0)))) 43 | 44 | 45 | if version.parse(torch.__version__) < version.parse("1.4"): 46 | gelu = _gelu_python 47 | else: 48 | gelu = F.gelu 49 | 50 | 51 | def gelu_fast(x): 52 | return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) 53 | 54 | 55 | def _silu_python(x): 56 | """ 57 | See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear 58 | Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function 59 | Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated 60 | Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with 61 | later. 
62 | """ 63 | return x * torch.sigmoid(x) 64 | 65 | 66 | if version.parse(torch.__version__) < version.parse("1.7"): 67 | silu = _silu_python 68 | else: 69 | silu = F.silu 70 | 71 | 72 | def mish(x): 73 | return x * torch.tanh(torch.nn.functional.softplus(x)) 74 | 75 | 76 | def linear_act(x): 77 | return x 78 | 79 | 80 | ACT2FN = { 81 | "relu": F.relu, 82 | "silu": silu, 83 | "swish": silu, 84 | "gelu": gelu, 85 | "tanh": torch.tanh, 86 | "gelu_new": gelu_new, 87 | "gelu_fast": gelu_fast, 88 | "mish": mish, 89 | "linear": linear_act, 90 | "sigmoid": torch.sigmoid, 91 | } 92 | 93 | 94 | def get_activation(activation_string): 95 | if activation_string in ACT2FN: 96 | return ACT2FN[activation_string] 97 | else: 98 | raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys()))) 99 | -------------------------------------------------------------------------------- /transformers/pipelines/text_classification.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from ..file_utils import add_end_docstrings, is_tf_available, is_torch_available 4 | from .base import PIPELINE_INIT_ARGS, Pipeline 5 | 6 | 7 | if is_tf_available(): 8 | from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING 9 | 10 | if is_torch_available(): 11 | from ..models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING 12 | 13 | 14 | @add_end_docstrings( 15 | PIPELINE_INIT_ARGS, 16 | r""" 17 | return_all_scores (:obj:`bool`, `optional`, defaults to :obj:`False`): 18 | Whether to return all prediction scores or just the one of the predicted class. 19 | """, 20 | ) 21 | class TextClassificationPipeline(Pipeline): 22 | """ 23 | Text classification pipeline using any :obj:`ModelForSequenceClassification`. See the `sequence classification 24 | examples <../task_summary.html#sequence-classification>`__ for more information. 25 | 26 | This text classification pipeline can currently be loaded from :func:`~transformers.pipeline` using the following 27 | task identifier: :obj:`"sentiment-analysis"` (for classifying sequences according to positive or negative 28 | sentiments). 29 | 30 | If multiple classification labels are available (:obj:`model.config.num_labels >= 2`), the pipeline will run a 31 | softmax over the results. If there is a single label, the pipeline will run a sigmoid over the result. 32 | 33 | The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. See 34 | the up-to-date list of available models on `huggingface.co/models 35 | `__. 36 | """ 37 | 38 | def __init__(self, return_all_scores: bool = False, **kwargs): 39 | super().__init__(**kwargs) 40 | 41 | self.check_model_type( 42 | TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING 43 | if self.framework == "tf" 44 | else MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING 45 | ) 46 | 47 | self.return_all_scores = return_all_scores 48 | 49 | def __call__(self, *args, **kwargs): 50 | """ 51 | Classify the text(s) given as inputs. 52 | 53 | Args: 54 | args (:obj:`str` or :obj:`List[str]`): 55 | One or several texts (or one list of prompts) to classify. 56 | 57 | Return: 58 | A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the following keys: 59 | 60 | - **label** (:obj:`str`) -- The label predicted. 61 | - **score** (:obj:`float`) -- The corresponding probability. 
62 | 63 | If ``self.return_all_scores=True``, one such dictionary is returned per label. 64 | """ 65 | outputs = super().__call__(*args, **kwargs) 66 | 67 | if self.model.config.num_labels == 1: 68 | scores = 1.0 / (1.0 + np.exp(-outputs)) 69 | else: 70 | scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True) 71 | if self.return_all_scores: 72 | return [ 73 | [{"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(item)] 74 | for item in scores 75 | ] 76 | else: 77 | return [ 78 | {"label": self.model.config.id2label[item.argmax()], "score": item.max().item()} for item in scores 79 | ] 80 | -------------------------------------------------------------------------------- /transformers/models/lxmert/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_tf_available, is_tokenizers_available, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_lxmert": ["LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "LxmertConfig"], 26 | "tokenization_lxmert": ["LxmertTokenizer"], 27 | } 28 | 29 | if is_tokenizers_available(): 30 | _import_structure["tokenization_lxmert_fast"] = ["LxmertTokenizerFast"] 31 | 32 | if is_torch_available(): 33 | _import_structure["modeling_lxmert"] = [ 34 | "LxmertEncoder", 35 | "LxmertForPreTraining", 36 | "LxmertForQuestionAnswering", 37 | "LxmertModel", 38 | "LxmertPreTrainedModel", 39 | "LxmertVisualFeatureEncoder", 40 | "LxmertXLayer", 41 | ] 42 | 43 | if is_tf_available(): 44 | _import_structure["modeling_tf_lxmert"] = [ 45 | "TF_LXMERT_PRETRAINED_MODEL_ARCHIVE_LIST", 46 | "TFLxmertForPreTraining", 47 | "TFLxmertMainLayer", 48 | "TFLxmertModel", 49 | "TFLxmertPreTrainedModel", 50 | "TFLxmertVisualFeatureEncoder", 51 | ] 52 | 53 | 54 | if TYPE_CHECKING: 55 | from .configuration_lxmert import LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP, LxmertConfig 56 | from .tokenization_lxmert import LxmertTokenizer 57 | 58 | if is_tokenizers_available(): 59 | from .tokenization_lxmert_fast import LxmertTokenizerFast 60 | 61 | if is_torch_available(): 62 | from .modeling_lxmert import ( 63 | LxmertEncoder, 64 | LxmertForPreTraining, 65 | LxmertForQuestionAnswering, 66 | LxmertModel, 67 | LxmertPreTrainedModel, 68 | LxmertVisualFeatureEncoder, 69 | LxmertXLayer, 70 | ) 71 | 72 | if is_tf_available(): 73 | from .modeling_tf_lxmert import ( 74 | TF_LXMERT_PRETRAINED_MODEL_ARCHIVE_LIST, 75 | TFLxmertForPreTraining, 76 | TFLxmertMainLayer, 77 | TFLxmertModel, 78 | TFLxmertPreTrainedModel, 79 | TFLxmertVisualFeatureEncoder, 80 | ) 81 | 82 | else: 83 | import importlib 84 | 
import os 85 | import sys 86 | 87 | class _LazyModule(_BaseLazyModule): 88 | """ 89 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 90 | """ 91 | 92 | __file__ = globals()["__file__"] 93 | __path__ = [os.path.dirname(__file__)] 94 | 95 | def _get_module(self, module_name: str): 96 | return importlib.import_module("." + module_name, self.__name__) 97 | 98 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 99 | -------------------------------------------------------------------------------- /transformers/models/transfo_xl/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_tf_available, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_transfo_xl": ["TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP", "TransfoXLConfig"], 26 | "tokenization_transfo_xl": ["TransfoXLCorpus", "TransfoXLTokenizer"], 27 | } 28 | 29 | if is_torch_available(): 30 | _import_structure["modeling_transfo_xl"] = [ 31 | "TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST", 32 | "AdaptiveEmbedding", 33 | "TransfoXLForSequenceClassification", 34 | "TransfoXLLMHeadModel", 35 | "TransfoXLModel", 36 | "TransfoXLPreTrainedModel", 37 | "load_tf_weights_in_transfo_xl", 38 | ] 39 | 40 | if is_tf_available(): 41 | _import_structure["modeling_tf_transfo_xl"] = [ 42 | "TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST", 43 | "TFAdaptiveEmbedding", 44 | "TFTransfoXLForSequenceClassification", 45 | "TFTransfoXLLMHeadModel", 46 | "TFTransfoXLMainLayer", 47 | "TFTransfoXLModel", 48 | "TFTransfoXLPreTrainedModel", 49 | ] 50 | 51 | 52 | if TYPE_CHECKING: 53 | from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig 54 | from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer 55 | 56 | if is_torch_available(): 57 | from .modeling_transfo_xl import ( 58 | TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST, 59 | AdaptiveEmbedding, 60 | TransfoXLForSequenceClassification, 61 | TransfoXLLMHeadModel, 62 | TransfoXLModel, 63 | TransfoXLPreTrainedModel, 64 | load_tf_weights_in_transfo_xl, 65 | ) 66 | 67 | if is_tf_available(): 68 | from .modeling_tf_transfo_xl import ( 69 | TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST, 70 | TFAdaptiveEmbedding, 71 | TFTransfoXLForSequenceClassification, 72 | TFTransfoXLLMHeadModel, 73 | TFTransfoXLMainLayer, 74 | TFTransfoXLModel, 75 | TFTransfoXLPreTrainedModel, 76 | ) 77 | 78 | else: 79 | import importlib 80 | import os 81 | import sys 82 | 83 | class _LazyModule(_BaseLazyModule): 84 | """ 85 | Module class that surfaces 
all objects but only performs associated imports when the objects are requested. 86 | """ 87 | 88 | __file__ = globals()["__file__"] 89 | __path__ = [os.path.dirname(__file__)] 90 | 91 | def _get_module(self, module_name: str): 92 | return importlib.import_module("." + module_name, self.__name__) 93 | 94 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 95 | --------------------------------------------------------------------------------
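The Transformer-XL __init__.py that closes the listing uses the same lazy-import boilerplate as the other model packages above: _import_structure maps submodule names to the public names they define, and the _LazyModule installed into sys.modules only imports a submodule when one of those names is requested. A toy analogue of that idea, simplified to standard-library modules so it runs on its own (it is not the library's _BaseLazyModule):

import importlib
import types


class LazyNamespace(types.SimpleNamespace):
    """Toy stand-in for the lazy-module pattern: resolve attributes on first access."""

    def __init__(self, attr_to_module):
        super().__init__()
        self._attr_to_module = attr_to_module

    def __getattr__(self, name):
        # Only called when normal attribute lookup fails, i.e. before the first access.
        try:
            module_name = self.__dict__["_attr_to_module"][name]
        except KeyError:
            raise AttributeError(name)
        value = getattr(importlib.import_module(module_name), name)
        setattr(self, name, value)  # cache so later accesses bypass __getattr__
        return value


ns = LazyNamespace({"sqrt": "math", "dumps": "json"})
print(ns.sqrt(2.0))        # the math module is imported only here
print(ns.dumps({"a": 1}))  # the json module is imported only here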
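Similarly, for the TextClassificationPipeline listed earlier (pipelines/text_classification.py), a hedged usage sketch via the pipeline() factory. The checkpoint actually used for the "sentiment-analysis" alias is whatever the installed transformers version resolves by default, and the printed labels and scores are illustrative only.

from transformers import pipeline

classifier = pipeline("sentiment-analysis")  # builds a TextClassificationPipeline
print(classifier("The generated summaries are surprisingly readable."))
# e.g. [{'label': 'POSITIVE', 'score': 0.99...}]

# return_all_scores=True is forwarded to TextClassificationPipeline.__init__ and yields
# one {label, score} dict per label instead of only the argmax label.
classifier_all = pipeline("sentiment-analysis", return_all_scores=True)
print(classifier_all("The generated summaries are surprisingly readable."))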