├── transformers
├── utils
│   ├── __init__.py
│   ├── dummy_flax_objects.py
│   ├── model_parallel_utils.py
│   └── dummy_sentencepiece_objects.py
├── benchmark
│   └── __init__.py
├── models
│   ├── dialogpt
│   │   ├── __init__.py
│   │   └── convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py
│   ├── xlm_prophetnet
│   │   ├── configuration_xlm_prophetnet.py
│   │   └── __init__.py
│   ├── camembert
│   │   └── configuration_camembert.py
│   ├── __init__.py
│   ├── phobert
│   │   └── __init__.py
│   ├── bertweet
│   │   └── __init__.py
│   ├── bert_japanese
│   │   └── __init__.py
│   ├── mmbt
│   │   ├── configuration_mmbt.py
│   │   └── __init__.py
│   ├── mobilebert
│   │   ├── tokenization_mobilebert.py
│   │   ├── tokenization_mobilebert_fast.py
│   │   └── convert_mobilebert_original_tf_checkpoint_to_pytorch.py
│   ├── herbert
│   │   ├── __init__.py
│   │   └── tokenization_herbert.py
│   ├── encoder_decoder
│   │   └── __init__.py
│   ├── bart
│   │   ├── tokenization_bart_fast.py
│   │   ├── tokenization_bart.py
│   │   └── __init__.py
│   ├── led
│   │   ├── tokenization_led.py
│   │   ├── tokenization_led_fast.py
│   │   └── __init__.py
│   ├── retribert
│   │   ├── tokenization_retribert.py
│   │   ├── tokenization_retribert_fast.py
│   │   └── __init__.py
│   ├── xlm_roberta
│   │   └── configuration_xlm_roberta.py
│   ├── barthez
│   │   └── __init__.py
│   ├── fsmt
│   │   └── __init__.py
│   ├── rag
│   │   ├── __init__.py
│   │   └── tokenization_rag.py
│   ├── layoutlm
│   │   ├── tokenization_layoutlm.py
│   │   ├── tokenization_layoutlm_fast.py
│   │   └── __init__.py
│   ├── mbart
│   │   ├── convert_mbart_original_checkpoint_to_pytorch.py
│   │   └── __init__.py
│   ├── longformer
│   │   ├── tokenization_longformer.py
│   │   ├── tokenization_longformer_fast.py
│   │   └── convert_longformer_original_pytorch_lightning_to_pytorch.py
│   ├── t5
│   │   └── convert_t5_original_tf_checkpoint_to_pytorch.py
│   ├── lxmert
│   │   ├── convert_lxmert_original_tf_checkpoint_to_pytorch.py
│   │   ├── tokenization_lxmert.py
│   │   ├── tokenization_lxmert_fast.py
│   │   └── __init__.py
│   ├── bert
│   │   └── convert_bert_original_tf_checkpoint_to_pytorch.py
│   ├── albert
│   │   └── convert_albert_original_tf_checkpoint_to_pytorch.py
│   ├── tapas
│   │   └── __init__.py
│   ├── bert_generation
│   │   └── __init__.py
│   ├── squeezebert
│   │   ├── tokenization_squeezebert.py
│   │   ├── __init__.py
│   │   └── tokenization_squeezebert_fast.py
│   ├── funnel
│   │   └── convert_funnel_original_tf_checkpoint_to_pytorch.py
│   ├── deberta
│   │   └── __init__.py
│   ├── prophetnet
│   │   └── __init__.py
│   ├── blenderbot
│   │   └── __init__.py
│   ├── gpt2
│   │   └── convert_gpt2_original_tf_checkpoint_to_pytorch.py
│   ├── marian
│   │   └── __init__.py
│   ├── blenderbot_small
│   │   └── __init__.py
│   ├── openai
│   │   ├── convert_openai_original_tf_checkpoint_to_pytorch.py
│   │   └── tokenization_openai_fast.py
│   ├── ctrl
│   │   └── __init__.py
│   ├── roberta
│   │   └── configuration_roberta.py
│   ├── reformer
│   │   └── __init__.py
│   ├── electra
│   │   ├── tokenization_electra.py
│   │   └── convert_electra_original_tf_checkpoint_to_pytorch.py
│   ├── pegasus
│   │   └── __init__.py
│   ├── distilbert
│   │   └── tokenization_distilbert.py
│   ├── mt5
│   │   └── __init__.py
│   ├── xlm
│   │   └── convert_xlm_original_pytorch_checkpoint_to_pytorch.py
│   └── transfo_xl
│       └── __init__.py
├── commands
│   ├── __init__.py
│   ├── download.py
│   ├── transformers_cli.py
│   └── env.py
├── data
│   ├── datasets
│   │   └── __init__.py
│   ├── processors
│   │   └── __init__.py
│   └── __init__.py
├── dependency_versions_check.py
├── training_args_seq2seq.py
├── dependency_versions_table.py
├── activations_tf.py
├── convert_tf_hub_seq_to_seq_bert_to_pytorch.py
├── activations.py
└── pipelines
    └── text_classification.py
├──
test_command.txt ├── requirements.txt ├── README.md └── .gitignore /transformers/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /transformers/benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /transformers/models/dialogpt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test_command.txt: -------------------------------------------------------------------------------- 1 | python run.py --model_name_or_path facebook/bart-base --do_train --do_eval --task summarization --train_file data/news_summary_train_small.csv --validation_file data/news_summary_valid_small.csv --output_dir output/ --overwrite_output_dir --num_beams=3 --min_summ_length=100 --max_summ_length=320 --length_penalty=1.0 --per_device_train_batch_size=4 --per_device_eval_batch_size=4 --predict_with_generate --text_column Text --summary_column Summary 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.11.0 2 | certifi==2020.12.5 3 | chardet==4.0.0 4 | click==7.1.2 5 | datasets==1.2.1 6 | dill==0.3.3 7 | filelock==3.0.12 8 | idna==2.10 9 | joblib==1.0.0 10 | multiprocess==0.70.11.1 11 | nltk==3.5 12 | numpy==1.19.5 13 | packaging==20.8 14 | pandas==1.2.1 15 | pyarrow==2.0.0 16 | pyparsing==2.4.7 17 | python-dateutil==2.8.1 18 | pytz==2020.5 19 | regex==2020.11.13 20 | requests==2.25.1 21 | rouge-score==0.0.4 22 | sacremoses==0.0.43 23 | six==1.15.0 24 | tokenizers==0.9.4 25 | torch==1.7.1 26 | tqdm==4.49.0 27 | transformers==4.2.2 28 | typing-extensions==3.7.4.3 29 | urllib3==1.26.2 30 | xxhash==2.0.0 31 | -------------------------------------------------------------------------------- /transformers/commands/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import ABC, abstractmethod 16 | from argparse import ArgumentParser 17 | 18 | 19 | class BaseTransformersCLICommand(ABC): 20 | @staticmethod 21 | @abstractmethod 22 | def register_subcommand(parser: ArgumentParser): 23 | raise NotImplementedError() 24 | 25 | @abstractmethod 26 | def run(self): 27 | raise NotImplementedError() 28 | -------------------------------------------------------------------------------- /transformers/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' 
imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from .glue import GlueDataset, GlueDataTrainingArguments 20 | from .language_modeling import ( 21 | LineByLineTextDataset, 22 | LineByLineWithRefDataset, 23 | LineByLineWithSOPTextDataset, 24 | TextDataset, 25 | TextDatasetForNextSentencePrediction, 26 | ) 27 | from .squad import SquadDataset, SquadDataTrainingArguments 28 | -------------------------------------------------------------------------------- /transformers/data/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels 20 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features 21 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor 22 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels 23 | -------------------------------------------------------------------------------- /transformers/utils/dummy_flax_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 
2 | from ..file_utils import requires_flax 3 | 4 | 5 | class FlaxPreTrainedModel: 6 | def __init__(self, *args, **kwargs): 7 | requires_flax(self) 8 | 9 | @classmethod 10 | def from_pretrained(self, *args, **kwargs): 11 | requires_flax(self) 12 | 13 | 14 | FLAX_MODEL_MAPPING = None 15 | 16 | 17 | class FlaxAutoModel: 18 | def __init__(self, *args, **kwargs): 19 | requires_flax(self) 20 | 21 | @classmethod 22 | def from_pretrained(self, *args, **kwargs): 23 | requires_flax(self) 24 | 25 | 26 | class FlaxBertForMaskedLM: 27 | def __init__(self, *args, **kwargs): 28 | requires_flax(self) 29 | 30 | @classmethod 31 | def from_pretrained(self, *args, **kwargs): 32 | requires_flax(self) 33 | 34 | 35 | class FlaxBertModel: 36 | def __init__(self, *args, **kwargs): 37 | requires_flax(self) 38 | 39 | @classmethod 40 | def from_pretrained(self, *args, **kwargs): 41 | requires_flax(self) 42 | 43 | 44 | class FlaxRobertaModel: 45 | def __init__(self, *args, **kwargs): 46 | requires_flax(self) 47 | 48 | @classmethod 49 | def from_pretrained(self, *args, **kwargs): 50 | requires_flax(self) 51 | -------------------------------------------------------------------------------- /transformers/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from .metrics import glue_compute_metrics, xnli_compute_metrics 20 | from .processors import ( 21 | DataProcessor, 22 | InputExample, 23 | InputFeatures, 24 | SingleSentenceClassificationProcessor, 25 | SquadExample, 26 | SquadFeatures, 27 | SquadV1Processor, 28 | SquadV2Processor, 29 | glue_convert_examples_to_features, 30 | glue_output_modes, 31 | glue_processors, 32 | glue_tasks_num_labels, 33 | squad_convert_examples_to_features, 34 | xnli_output_modes, 35 | xnli_processors, 36 | xnli_tasks_num_labels, 37 | ) 38 | -------------------------------------------------------------------------------- /transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ XLM-ProphetNet model configuration """ 16 | 17 | 18 | from ...utils import logging 19 | from ..prophetnet.configuration_prophetnet import ProphetNetConfig 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { 25 | "microsoft/xprophetnet-large-wiki100-cased": "https://huggingface.co/microsoft/xprophetnet-large-wiki100-cased/resolve/main/config.json", 26 | } 27 | 28 | 29 | class XLMProphetNetConfig(ProphetNetConfig): 30 | """ 31 | This class overrides :class:`~transformers.ProphetNetConfig`. Please check the superclass for the appropriate 32 | documentation alongside usage examples. 33 | """ 34 | 35 | model_type = "xlm-prophetnet" 36 | -------------------------------------------------------------------------------- /transformers/models/xlm_prophetnet/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_sentencepiece_available, is_torch_available 20 | from .configuration_xlm_prophetnet import XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMProphetNetConfig 21 | 22 | 23 | if is_sentencepiece_available(): 24 | from .tokenization_xlm_prophetnet import XLMProphetNetTokenizer 25 | 26 | if is_torch_available(): 27 | from .modeling_xlm_prophetnet import ( 28 | XLM_PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST, 29 | XLMProphetNetDecoder, 30 | XLMProphetNetEncoder, 31 | XLMProphetNetForCausalLM, 32 | XLMProphetNetForConditionalGeneration, 33 | XLMProphetNetModel, 34 | ) 35 | -------------------------------------------------------------------------------- /transformers/models/camembert/configuration_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ CamemBERT configuration """ 17 | 18 | from ...utils import logging 19 | from ..roberta.configuration_roberta import RobertaConfig 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 25 | "camembert-base": "https://huggingface.co/camembert-base/resolve/main/config.json", 26 | "umberto-commoncrawl-cased-v1": "https://huggingface.co/Musixmatch/umberto-commoncrawl-cased-v1/resolve/main/config.json", 27 | "umberto-wikipedia-uncased-v1": "https://huggingface.co/Musixmatch/umberto-wikipedia-uncased-v1/resolve/main/config.json", 28 | } 29 | 30 | 31 | class CamembertConfig(RobertaConfig): 32 | """ 33 | This class overrides :class:`~transformers.RobertaConfig`. Please check the superclass for the appropriate 34 | documentation alongside usage examples. 35 | """ 36 | 37 | model_type = "camembert" 38 | -------------------------------------------------------------------------------- /transformers/models/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from . import ( 20 | albert, 21 | auto, 22 | bart, 23 | barthez, 24 | bert, 25 | bert_generation, 26 | bert_japanese, 27 | bertweet, 28 | blenderbot, 29 | blenderbot_small, 30 | camembert, 31 | ctrl, 32 | deberta, 33 | dialogpt, 34 | distilbert, 35 | dpr, 36 | electra, 37 | encoder_decoder, 38 | flaubert, 39 | fsmt, 40 | funnel, 41 | gpt2, 42 | herbert, 43 | layoutlm, 44 | led, 45 | longformer, 46 | lxmert, 47 | marian, 48 | mbart, 49 | mmbt, 50 | mobilebert, 51 | mpnet, 52 | mt5, 53 | openai, 54 | pegasus, 55 | phobert, 56 | prophetnet, 57 | rag, 58 | reformer, 59 | retribert, 60 | roberta, 61 | squeezebert, 62 | t5, 63 | tapas, 64 | transfo_xl, 65 | xlm, 66 | xlm_roberta, 67 | xlnet, 68 | ) 69 | -------------------------------------------------------------------------------- /transformers/models/phobert/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule 22 | 23 | 24 | _import_structure = { 25 | "tokenization_phobert": ["PhobertTokenizer"], 26 | } 27 | 28 | 29 | if TYPE_CHECKING: 30 | from .tokenization_phobert import PhobertTokenizer 31 | 32 | else: 33 | import importlib 34 | import os 35 | import sys 36 | 37 | class _LazyModule(_BaseLazyModule): 38 | """ 39 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 40 | """ 41 | 42 | __file__ = globals()["__file__"] 43 | __path__ = [os.path.dirname(__file__)] 44 | 45 | def _get_module(self, module_name: str): 46 | return importlib.import_module("." + module_name, self.__name__) 47 | 48 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 49 | -------------------------------------------------------------------------------- /transformers/models/bertweet/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule 22 | 23 | 24 | _import_structure = { 25 | "tokenization_bertweet": ["BertweetTokenizer"], 26 | } 27 | 28 | 29 | if TYPE_CHECKING: 30 | from .tokenization_bertweet import BertweetTokenizer 31 | 32 | else: 33 | import importlib 34 | import os 35 | import sys 36 | 37 | class _LazyModule(_BaseLazyModule): 38 | """ 39 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 40 | """ 41 | 42 | __file__ = globals()["__file__"] 43 | __path__ = [os.path.dirname(__file__)] 44 | 45 | def _get_module(self, module_name: str): 46 | return importlib.import_module("." + module_name, self.__name__) 47 | 48 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 49 | -------------------------------------------------------------------------------- /transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import os 17 | 18 | import torch 19 | 20 | from transformers.file_utils import WEIGHTS_NAME 21 | 22 | 23 | DIALOGPT_MODELS = ["small", "medium", "large"] 24 | 25 | OLD_KEY = "lm_head.decoder.weight" 26 | NEW_KEY = "lm_head.weight" 27 | 28 | 29 | def convert_dialogpt_checkpoint(checkpoint_path: str, pytorch_dump_folder_path: str): 30 | d = torch.load(checkpoint_path) 31 | d[NEW_KEY] = d.pop(OLD_KEY) 32 | os.makedirs(pytorch_dump_folder_path, exist_ok=True) 33 | torch.save(d, os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)) 34 | 35 | 36 | if __name__ == "__main__": 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument("--dialogpt_path", default=".", type=str) 39 | args = parser.parse_args() 40 | for MODEL in DIALOGPT_MODELS: 41 | checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl") 42 | pytorch_dump_folder_path = f"./DialoGPT-{MODEL}" 43 | convert_dialogpt_checkpoint( 44 | checkpoint_path, 45 | pytorch_dump_folder_path, 46 | ) 47 | -------------------------------------------------------------------------------- /transformers/models/bert_japanese/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule 22 | 23 | 24 | _import_structure = { 25 | "tokenization_bert_japanese": ["BertJapaneseTokenizer", "CharacterTokenizer", "MecabTokenizer"], 26 | } 27 | 28 | 29 | if TYPE_CHECKING: 30 | from .tokenization_bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer 31 | 32 | else: 33 | import importlib 34 | import os 35 | import sys 36 | 37 | class _LazyModule(_BaseLazyModule): 38 | """ 39 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 40 | """ 41 | 42 | __file__ = globals()["__file__"] 43 | __path__ = [os.path.dirname(__file__)] 44 | 45 | def _get_module(self, module_name: str): 46 | return importlib.import_module("." 
+ module_name, self.__name__) 47 | 48 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 49 | -------------------------------------------------------------------------------- /transformers/models/mmbt/configuration_mmbt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # Copyright (c) HuggingFace Inc. team. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ MMBT configuration """ 17 | 18 | from ...utils import logging 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | 24 | class MMBTConfig(object): 25 | """ 26 | This is the configuration class to store the configuration of a :class:`~transformers.MMBTModel`. It is used to 27 | instantiate a MMBT model according to the specified arguments, defining the model architecture. 28 | 29 | Args: 30 | config (:class:`~transformers.PreTrainedConfig`): 31 | Config of the underlying Transformer models. Its values are copied over to use a single config. 32 | num_labels (:obj:`int`, `optional`): 33 | Size of final Linear layer for classification. 34 | modal_hidden_size (:obj:`int`, `optional`, defaults to 2048): 35 | Embedding dimension of the non-text modality encoder. 36 | """ 37 | 38 | def __init__(self, config, num_labels=None, modal_hidden_size=2048): 39 | self.__dict__ = config.__dict__ 40 | self.modal_hidden_size = modal_hidden_size 41 | if num_labels: 42 | self.num_labels = num_labels 43 | -------------------------------------------------------------------------------- /transformers/dependency_versions_check.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import sys 15 | 16 | from .dependency_versions_table import deps 17 | from .utils.versions import require_version_core 18 | 19 | 20 | # define which module versions we always want to check at run time 21 | # (usually the ones defined in `install_requires` in setup.py) 22 | # 23 | # order specific notes: 24 | # - tqdm must be checked before tokenizers 25 | 26 | pkgs_to_check_at_runtime = "python tqdm regex sacremoses requests packaging filelock numpy tokenizers".split() 27 | if sys.version_info < (3, 7): 28 | pkgs_to_check_at_runtime.append("dataclasses") 29 | if sys.version_info < (3, 8): 30 | pkgs_to_check_at_runtime.append("importlib_metadata") 31 | 32 | for pkg in pkgs_to_check_at_runtime: 33 | if pkg in deps: 34 | if pkg == "tokenizers": 35 | # must be loaded here, or else tqdm check may fail 36 | from .file_utils import is_tokenizers_available 37 | 38 | if not is_tokenizers_available(): 39 | continue # not required, check version only if installed 40 | 41 | require_version_core(deps[pkg]) 42 | else: 43 | raise ValueError(f"can't find {pkg} in {deps.keys()}, check dependency_versions_table.py") 44 | -------------------------------------------------------------------------------- /transformers/training_args_seq2seq.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | from dataclasses import dataclass, field 17 | 18 | from .file_utils import add_start_docstrings 19 | from .training_args import TrainingArguments 20 | 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | @dataclass 26 | @add_start_docstrings(TrainingArguments.__doc__) 27 | class Seq2SeqTrainingArguments(TrainingArguments): 28 | """ 29 | sortish_sampler (:obj:`bool`, `optional`, defaults to :obj:`False`): 30 | Whether to use a `sortish sampler` or not. Only possible if the underlying datasets are `Seq2SeqDataset` for 31 | now but will become generally available in the near future. 32 | 33 | It sorts the inputs according to lengths in order to minimize the padding size, with a bit of randomness for 34 | the training set. 35 | predict_with_generate (:obj:`bool`, `optional`, defaults to :obj:`False`): 36 | Whether to use generate to calculate generative metrics (ROUGE, BLEU). 37 | """ 38 | 39 | sortish_sampler: bool = field(default=False, metadata={"help": "Whether to use SortishSampler or not."}) 40 | predict_with_generate: bool = field( 41 | default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."} 42 | ) 43 | -------------------------------------------------------------------------------- /transformers/models/mobilebert/tokenization_mobilebert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # 3 | # Copyright 2020 The HuggingFace Team. All rights reserved. 
4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """Tokenization classes for MobileBERT.""" 17 | 18 | from ...utils import logging 19 | from ..bert.tokenization_bert import BertTokenizer 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 25 | 26 | PRETRAINED_VOCAB_FILES_MAP = { 27 | "vocab_file": {"mobilebert-uncased": "https://huggingface.co/google/mobilebert-uncased/resolve/main/vocab.txt"} 28 | } 29 | 30 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"mobilebert-uncased": 512} 31 | 32 | 33 | PRETRAINED_INIT_CONFIGURATION = {} 34 | 35 | 36 | class MobileBertTokenizer(BertTokenizer): 37 | r""" 38 | Construct a MobileBERT tokenizer. 39 | 40 | :class:`~transformers.MobileBertTokenizer is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 41 | tokenization: punctuation splitting and wordpiece. 42 | 43 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 44 | parameters. 45 | """ 46 | 47 | vocab_files_names = VOCAB_FILES_NAMES 48 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 49 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 50 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 51 | -------------------------------------------------------------------------------- /transformers/models/herbert/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_tokenizers_available 22 | 23 | 24 | _import_structure = { 25 | "tokenization_herbert": ["HerbertTokenizer"], 26 | } 27 | 28 | if is_tokenizers_available(): 29 | _import_structure["tokenization_herbert_fast"] = ["HerbertTokenizerFast"] 30 | 31 | 32 | if TYPE_CHECKING: 33 | from .tokenization_herbert import HerbertTokenizer 34 | 35 | if is_tokenizers_available(): 36 | from .tokenization_herbert_fast import HerbertTokenizerFast 37 | 38 | else: 39 | import importlib 40 | import os 41 | import sys 42 | 43 | class _LazyModule(_BaseLazyModule): 44 | """ 45 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 46 | """ 47 | 48 | __file__ = globals()["__file__"] 49 | __path__ = [os.path.dirname(__file__)] 50 | 51 | def _get_module(self, module_name: str): 52 | return importlib.import_module("." + module_name, self.__name__) 53 | 54 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 55 | -------------------------------------------------------------------------------- /transformers/models/mmbt/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_mmbt": ["MMBTConfig"], 26 | } 27 | 28 | if is_torch_available(): 29 | _import_structure["modeling_mmbt"] = ["MMBTForClassification", "MMBTModel", "ModalEmbeddings"] 30 | 31 | 32 | if TYPE_CHECKING: 33 | from .configuration_mmbt import MMBTConfig 34 | 35 | if is_torch_available(): 36 | from .modeling_mmbt import MMBTForClassification, MMBTModel, ModalEmbeddings 37 | 38 | else: 39 | import importlib 40 | import os 41 | import sys 42 | 43 | class _LazyModule(_BaseLazyModule): 44 | """ 45 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 46 | """ 47 | 48 | __file__ = globals()["__file__"] 49 | __path__ = [os.path.dirname(__file__)] 50 | 51 | def _get_module(self, module_name: str): 52 | return importlib.import_module("." + module_name, self.__name__) 53 | 54 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 55 | -------------------------------------------------------------------------------- /transformers/models/encoder_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. 
So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_encoder_decoder": ["EncoderDecoderConfig"], 26 | } 27 | 28 | if is_torch_available(): 29 | _import_structure["modeling_encoder_decoder"] = ["EncoderDecoderModel"] 30 | 31 | 32 | if TYPE_CHECKING: 33 | from .configuration_encoder_decoder import EncoderDecoderConfig 34 | 35 | if is_torch_available(): 36 | from .modeling_encoder_decoder import EncoderDecoderModel 37 | 38 | else: 39 | import importlib 40 | import os 41 | import sys 42 | 43 | class _LazyModule(_BaseLazyModule): 44 | """ 45 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 46 | """ 47 | 48 | __file__ = globals()["__file__"] 49 | __path__ = [os.path.dirname(__file__)] 50 | 51 | def _get_module(self, module_name: str): 52 | return importlib.import_module("." + module_name, self.__name__) 53 | 54 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 55 | -------------------------------------------------------------------------------- /transformers/models/bart/tokenization_bart_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | from ...utils import logging 17 | from ..roberta.tokenization_roberta_fast import RobertaTokenizerFast 18 | from .tokenization_bart import BartTokenizer 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | 24 | # vocab and merges same as roberta 25 | vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json" 26 | merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt" 27 | tokenizer_url = "https://huggingface.co/roberta-large/resolve/main/tokenizer.json" 28 | _all_bart_models = [ 29 | "facebook/bart-base", 30 | "facebook/bart-large", 31 | "facebook/bart-large-mnli", 32 | "facebook/bart-large-cnn", 33 | "facebook/bart-large-xsum", 34 | "yjernite/bart_eli5", 35 | # This is not exhaustive: see https://huggingface.co/models?filter=bart 36 | ] 37 | 38 | 39 | class BartTokenizerFast(RobertaTokenizerFast): 40 | # merges and vocab same as Roberta 41 | max_model_input_sizes = {m: 1024 for m in _all_bart_models} 42 | pretrained_vocab_files_map = { 43 | "vocab_file": {m: vocab_url for m in _all_bart_models}, 44 | "merges_file": {m: merges_url for m in _all_bart_models}, 45 | "tokenizer_file": {m: tokenizer_url for m in _all_bart_models}, 46 | } 47 | slow_tokenizer_class = BartTokenizer 48 | -------------------------------------------------------------------------------- /transformers/dependency_versions_table.py: -------------------------------------------------------------------------------- 1 | # THIS FILE HAS BEEN AUTOGENERATED. To update: 2 | # 1. modify the `_deps` dict in setup.py 3 | # 2. run `make deps_table_update`` 4 | deps = { 5 | "black": "black>=20.8b1", 6 | "cookiecutter": "cookiecutter==1.7.2", 7 | "dataclasses": "dataclasses", 8 | "datasets": "datasets", 9 | "faiss-cpu": "faiss-cpu", 10 | "fastapi": "fastapi", 11 | "filelock": "filelock", 12 | "flake8": "flake8>=3.8.3", 13 | "flax": "flax>=0.2.2", 14 | "fugashi": "fugashi>=1.0", 15 | "importlib_metadata": "importlib_metadata", 16 | "ipadic": "ipadic>=1.0.0,<2.0", 17 | "isort": "isort>=5.5.4", 18 | "jax": "jax>=0.2.0", 19 | "jaxlib": "jaxlib==0.1.55", 20 | "keras2onnx": "keras2onnx", 21 | "numpy": "numpy", 22 | "onnxconverter-common": "onnxconverter-common", 23 | "onnxruntime-tools": "onnxruntime-tools>=1.4.2", 24 | "onnxruntime": "onnxruntime>=1.4.0", 25 | "packaging": "packaging", 26 | "parameterized": "parameterized", 27 | "protobuf": "protobuf", 28 | "psutil": "psutil", 29 | "pydantic": "pydantic", 30 | "pytest": "pytest", 31 | "pytest-xdist": "pytest-xdist", 32 | "python": "python>=3.6.0", 33 | "recommonmark": "recommonmark", 34 | "regex": "regex!=2019.12.17", 35 | "requests": "requests", 36 | "sacremoses": "sacremoses", 37 | "scikit-learn": "scikit-learn", 38 | "sentencepiece": "sentencepiece==0.1.91", 39 | "sphinx-copybutton": "sphinx-copybutton", 40 | "sphinx-markdown-tables": "sphinx-markdown-tables", 41 | "sphinx-rtd-theme": "sphinx-rtd-theme==0.4.3", 42 | "sphinx": "sphinx==3.2.1", 43 | "starlette": "starlette", 44 | "tensorflow-cpu": "tensorflow-cpu>=2.3", 45 | "tensorflow": "tensorflow>=2.3", 46 | "timeout-decorator": "timeout-decorator", 47 | "tokenizers": "tokenizers==0.9.4", 48 | "torch": "torch>=1.0", 49 | "tqdm": "tqdm>=4.27", 50 | "unidic": "unidic>=1.0.2", 51 | "unidic_lite": "unidic_lite>=1.0.7", 52 | "uvicorn": "uvicorn", 53 | } 54 | -------------------------------------------------------------------------------- /transformers/commands/download.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The 
HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from argparse import ArgumentParser 16 | 17 | from . import BaseTransformersCLICommand 18 | 19 | 20 | def download_command_factory(args): 21 | return DownloadCommand(args.model, args.cache_dir, args.force) 22 | 23 | 24 | class DownloadCommand(BaseTransformersCLICommand): 25 | @staticmethod 26 | def register_subcommand(parser: ArgumentParser): 27 | download_parser = parser.add_parser("download") 28 | download_parser.add_argument( 29 | "--cache-dir", type=str, default=None, help="Path to location to store the models" 30 | ) 31 | download_parser.add_argument( 32 | "--force", action="store_true", help="Force the model to be download even if already in cache-dir" 33 | ) 34 | download_parser.add_argument("model", type=str, help="Name of the model to download") 35 | download_parser.set_defaults(func=download_command_factory) 36 | 37 | def __init__(self, model: str, cache: str, force: bool): 38 | self._model = model 39 | self._cache = cache 40 | self._force = force 41 | 42 | def run(self): 43 | from ..models.auto import AutoModel, AutoTokenizer 44 | 45 | AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 46 | AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 47 | -------------------------------------------------------------------------------- /transformers/commands/transformers_cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | from argparse import ArgumentParser 17 | 18 | from .add_new_model import AddNewModelCommand 19 | from .convert import ConvertCommand 20 | from .download import DownloadCommand 21 | from .env import EnvironmentCommand 22 | from .lfs import LfsCommands 23 | from .run import RunCommand 24 | from .serving import ServeCommand 25 | from .user import UserCommands 26 | 27 | 28 | def main(): 29 | parser = ArgumentParser("Transformers CLI tool", usage="transformers-cli []") 30 | commands_parser = parser.add_subparsers(help="transformers-cli command helpers") 31 | 32 | # Register commands 33 | ConvertCommand.register_subcommand(commands_parser) 34 | DownloadCommand.register_subcommand(commands_parser) 35 | EnvironmentCommand.register_subcommand(commands_parser) 36 | RunCommand.register_subcommand(commands_parser) 37 | ServeCommand.register_subcommand(commands_parser) 38 | UserCommands.register_subcommand(commands_parser) 39 | AddNewModelCommand.register_subcommand(commands_parser) 40 | LfsCommands.register_subcommand(commands_parser) 41 | 42 | # Let's go 43 | args = parser.parse_args() 44 | 45 | if not hasattr(args, "func"): 46 | parser.print_help() 47 | exit(1) 48 | 49 | # Run 50 | service = args.func(args) 51 | service.run() 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /transformers/models/led/tokenization_led.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for LED.""" 16 | from ...utils import logging 17 | from ..bart.tokenization_bart import BartTokenizer 18 | 19 | 20 | logger = logging.get_logger(__name__) 21 | 22 | PRETRAINED_VOCAB_FILES_MAP = { 23 | "vocab_file": { 24 | "allenai/led-base-16384": "https://huggingface.co/allenai/led-base-16384/resolve/main/vocab.json", 25 | }, 26 | "merges_file": { 27 | "allenai/led-base-16384": "https://huggingface.co/allenai/led-base-16384/resolve/main/merges.txt", 28 | }, 29 | "tokenizer_file": { 30 | "allenai/led-base-16384": "https://huggingface.co/allenai/led-base-16384/resolve/main/tokenizer.json", 31 | }, 32 | } 33 | 34 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 35 | "allenai/led-base-16384": 16384, 36 | } 37 | 38 | 39 | class LEDTokenizer(BartTokenizer): 40 | """ 41 | Construct a LED tokenizer. 42 | 43 | :class:`~transformers.LEDTokenizer` is identical to :class:`~transformers.BartTokenizer` and runs end-to-end 44 | tokenization: punctuation splitting and wordpiece. 45 | 46 | Refer to superclass :class:`~transformers.BartTokenizer` for usage examples and documentation concerning 47 | parameters. 
48 | """ 49 | 50 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 51 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 52 | -------------------------------------------------------------------------------- /transformers/models/retribert/tokenization_retribert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for RetriBERT.""" 16 | 17 | from ...utils import logging 18 | from ..bert.tokenization_bert import BertTokenizer 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 24 | 25 | PRETRAINED_VOCAB_FILES_MAP = { 26 | "vocab_file": { 27 | "yjernite/retribert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", 28 | } 29 | } 30 | 31 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 32 | "yjernite/retribert-base-uncased": 512, 33 | } 34 | 35 | 36 | PRETRAINED_INIT_CONFIGURATION = { 37 | "yjernite/retribert-base-uncased": {"do_lower_case": True}, 38 | } 39 | 40 | 41 | class RetriBertTokenizer(BertTokenizer): 42 | r""" 43 | Constructs a RetriBERT tokenizer. 44 | 45 | :class:`~transformers.RetroBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 46 | tokenization: punctuation splitting and wordpiece. 47 | 48 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 49 | parameters. 50 | """ 51 | 52 | vocab_files_names = VOCAB_FILES_NAMES 53 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 54 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 55 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 56 | model_input_names = ["attention_mask"] 57 | -------------------------------------------------------------------------------- /transformers/models/xlm_roberta/configuration_xlm_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ XLM-RoBERTa configuration """ 17 | 18 | from ...utils import logging 19 | from ..roberta.configuration_roberta import RobertaConfig 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 25 | "xlm-roberta-base": "https://huggingface.co/xlm-roberta-base/resolve/main/config.json", 26 | "xlm-roberta-large": "https://huggingface.co/xlm-roberta-large/resolve/main/config.json", 27 | "xlm-roberta-large-finetuned-conll02-dutch": "https://huggingface.co/xlm-roberta-large-finetuned-conll02-dutch/resolve/main/config.json", 28 | "xlm-roberta-large-finetuned-conll02-spanish": "https://huggingface.co/xlm-roberta-large-finetuned-conll02-spanish/resolve/main/config.json", 29 | "xlm-roberta-large-finetuned-conll03-english": "https://huggingface.co/xlm-roberta-large-finetuned-conll03-english/resolve/main/config.json", 30 | "xlm-roberta-large-finetuned-conll03-german": "https://huggingface.co/xlm-roberta-large-finetuned-conll03-german/resolve/main/config.json", 31 | } 32 | 33 | 34 | class XLMRobertaConfig(RobertaConfig): 35 | """ 36 | This class overrides :class:`~transformers.RobertaConfig`. Please check the superclass for the appropriate 37 | documentation alongside usage examples. 38 | """ 39 | 40 | model_type = "xlm-roberta" 41 | -------------------------------------------------------------------------------- /transformers/models/barthez/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_sentencepiece_available, is_tokenizers_available 22 | 23 | 24 | _import_structure = {} 25 | 26 | if is_sentencepiece_available(): 27 | _import_structure["tokenization_barthez"] = ["BarthezTokenizer"] 28 | 29 | if is_tokenizers_available(): 30 | _import_structure["tokenization_barthez_fast"] = ["BarthezTokenizerFast"] 31 | 32 | 33 | if TYPE_CHECKING: 34 | 35 | if is_sentencepiece_available(): 36 | from .tokenization_barthez import BarthezTokenizer 37 | 38 | if is_tokenizers_available(): 39 | from .tokenization_barthez_fast import BarthezTokenizerFast 40 | 41 | else: 42 | import importlib 43 | import os 44 | import sys 45 | 46 | class _LazyModule(_BaseLazyModule): 47 | """ 48 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 49 | """ 50 | 51 | __file__ = globals()["__file__"] 52 | __path__ = [os.path.dirname(__file__)] 53 | 54 | def _get_module(self, module_name: str): 55 | return importlib.import_module("." 
+ module_name, self.__name__) 56 | 57 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 58 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Finetune-Transformers 2 | 3 | ## Finetuning and evaluating transformers on a summarization task 4 | The main objective of this module is to fine-tune and evaluate a model (pre-trained on a large-scale dataset) on domain-specific data. Finetuning improves the performance of the model on domain-specific tasks. A pre-trained model can be finetuned on a number of downstream tasks, depending on its architecture. 5 | As an example, I finetune sequence-to-sequence models such as T5, BART and Pegasus on an abstractive summarization task using the Trainer API from [Hugging Face](https://huggingface.co/transformers/main_classes/trainer.html). 6 | 7 | * A number of pre-trained models can be finetuned, such as: 8 | * T5 (small, base, large, 3B, 11B) 9 | * BART (base, large-cnn, large-mnli) 10 | * Longformer Encoder Decoder (allenai/led-base-16384, allenai/led-large-16384) 11 | * Pegasus (large, xsum, multi_news) 12 | 13 | Check out [pre-trained models](https://huggingface.co/models) to see the checkpoints available for each of them. 14 | *** 15 | ## Script 16 | Finetuning with a custom dataset placed at [`data/`](https://github.com/nsi319/Finetune-Transformers/tree/main/data): 17 | 18 | ```bash 19 | python run.py \ 20 | --model_name_or_path facebook/bart-base \ 21 | --train_file data/news_summary_train_small.csv \ 22 | --validation_file data/news_summary_valid_small.csv \ 23 | --text_column Text \ 24 | --summary_column Summary \ 25 | --output_dir output/ \ 26 | --overwrite_output_dir \ 27 | --do_train \ 28 | --do_eval \ 29 | --num_beams=3 \ 30 | --min_summ_length=100 \ 31 | --max_summ_length=250 \ 32 | --length_penalty=1.0 \ 33 | --per_device_train_batch_size=4 \ 34 | --per_device_eval_batch_size=4 \ 35 | --predict_with_generate 36 | ``` 37 | 38 | To see all the possible command-line options, run: 39 | 40 | ```bash 41 | python run.py --help 42 | ``` 43 | If you are using **Google Colab**, open [`colab/finetuning.ipynb`](https://github.com/nsi319/Finetune-Transformers/blob/main/colab/finetuning.ipynb) in Colab, save a copy in Drive, and follow the instructions. 44 | 45 | 46 | -------------------------------------------------------------------------------- /transformers/models/bart/tokenization_bart.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | from ...utils import logging 17 | from ..roberta.tokenization_roberta import RobertaTokenizer 18 | 19 | 20 | logger = logging.get_logger(__name__) 21 | 22 | 23 | # vocab and merges same as roberta 24 | vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json" 25 | merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt" 26 | _all_bart_models = [ 27 | "facebook/bart-base", 28 | "facebook/bart-large", 29 | "facebook/bart-large-mnli", 30 | "facebook/bart-large-cnn", 31 | "facebook/bart-large-xsum", 32 | "yjernite/bart_eli5", 33 | # This is not exhaustive: see https://huggingface.co/models?filter=bart 34 | ] 35 | 36 | 37 | class BartTokenizer(RobertaTokenizer): 38 | r""" 39 | Construct a BART tokenizer. 40 | 41 | :class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer` and adds a new 42 | :meth:`~transformers.BartTokenizer.prepare_seq2seq_batch` 43 | 44 | Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the 45 | initialization parameters and other methods. 46 | """ 47 | # merges and vocab same as Roberta 48 | max_model_input_sizes = {m: 1024 for m in _all_bart_models} 49 | pretrained_vocab_files_map = { 50 | "vocab_file": {m: vocab_url for m in _all_bart_models}, 51 | "merges_file": {m: merges_url for m in _all_bart_models}, 52 | } 53 | -------------------------------------------------------------------------------- /transformers/models/fsmt/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_fsmt": ["FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FSMTConfig"], 26 | "tokenization_fsmt": ["FSMTTokenizer"], 27 | } 28 | 29 | if is_torch_available(): 30 | _import_structure["modeling_fsmt"] = ["FSMTForConditionalGeneration", "FSMTModel", "PretrainedFSMTModel"] 31 | 32 | 33 | if TYPE_CHECKING: 34 | from .configuration_fsmt import FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP, FSMTConfig 35 | from .tokenization_fsmt import FSMTTokenizer 36 | 37 | if is_torch_available(): 38 | from .modeling_fsmt import FSMTForConditionalGeneration, FSMTModel, PretrainedFSMTModel 39 | 40 | else: 41 | import importlib 42 | import os 43 | import sys 44 | 45 | class _LazyModule(_BaseLazyModule): 46 | """ 47 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 
48 | """ 49 | 50 | __file__ = globals()["__file__"] 51 | __path__ = [os.path.dirname(__file__)] 52 | 53 | def _get_module(self, module_name: str): 54 | return importlib.import_module("." + module_name, self.__name__) 55 | 56 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 57 | -------------------------------------------------------------------------------- /transformers/models/rag/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_rag": ["RagConfig"], 26 | "retrieval_rag": ["RagRetriever"], 27 | "tokenization_rag": ["RagTokenizer"], 28 | } 29 | 30 | if is_torch_available(): 31 | _import_structure["modeling_rag"] = ["RagModel", "RagSequenceForGeneration", "RagTokenForGeneration"] 32 | 33 | 34 | if TYPE_CHECKING: 35 | from .configuration_rag import RagConfig 36 | from .retrieval_rag import RagRetriever 37 | from .tokenization_rag import RagTokenizer 38 | 39 | if is_torch_available(): 40 | from .modeling_rag import RagModel, RagSequenceForGeneration, RagTokenForGeneration 41 | 42 | else: 43 | import importlib 44 | import os 45 | import sys 46 | 47 | class _LazyModule(_BaseLazyModule): 48 | """ 49 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 50 | """ 51 | 52 | __file__ = globals()["__file__"] 53 | __path__ = [os.path.dirname(__file__)] 54 | 55 | def _get_module(self, module_name: str): 56 | return importlib.import_module("." + module_name, self.__name__) 57 | 58 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 59 | -------------------------------------------------------------------------------- /transformers/models/led/tokenization_led_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for LED.""" 16 | from ...utils import logging 17 | from ..bart.tokenization_bart_fast import BartTokenizerFast 18 | from .tokenization_led import LEDTokenizer 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | PRETRAINED_VOCAB_FILES_MAP = { 24 | "vocab_file": { 25 | "allenai/led-base-16384": "https://huggingface.co/allenai/led-base-16384/resolve/main/vocab.json", 26 | }, 27 | "merges_file": { 28 | "allenai/led-base-16384": "https://huggingface.co/allenai/led-base-16384/resolve/main/merges.txt", 29 | }, 30 | "tokenizer_file": { 31 | "allenai/led-base-16384": "https://huggingface.co/allenai/led-base-16384/resolve/main/tokenizer.json", 32 | }, 33 | } 34 | 35 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 36 | "allenai/led-base-16384": 16384, 37 | } 38 | 39 | 40 | class LEDTokenizerFast(BartTokenizerFast): 41 | r""" 42 | Construct a "fast" LED tokenizer (backed by HuggingFace's `tokenizers` library). 43 | 44 | :class:`~transformers.LEDTokenizerFast` is identical to :class:`~transformers.BartTokenizerFast` and runs 45 | end-to-end tokenization: punctuation splitting and wordpiece. 46 | 47 | Refer to superclass :class:`~transformers.BartTokenizerFast` for usage examples and documentation concerning 48 | parameters. 49 | """ 50 | 51 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 52 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 53 | slow_tokenizer_class = LEDTokenizer 54 | -------------------------------------------------------------------------------- /transformers/models/layoutlm/tokenization_layoutlm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Tokenization class for model LayoutLM.""" 16 | 17 | 18 | from ...utils import logging 19 | from ..bert.tokenization_bert import BertTokenizer 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 25 | 26 | PRETRAINED_VOCAB_FILES_MAP = { 27 | "vocab_file": { 28 | "microsoft/layoutlm-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", 29 | "microsoft/layoutlm-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt", 30 | } 31 | } 32 | 33 | 34 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 35 | "microsoft/layoutlm-base-uncased": 512, 36 | "microsoft/layoutlm-large-uncased": 512, 37 | } 38 | 39 | 40 | PRETRAINED_INIT_CONFIGURATION = { 41 | "microsoft/layoutlm-base-uncased": {"do_lower_case": True}, 42 | "microsoft/layoutlm-large-uncased": {"do_lower_case": True}, 43 | } 44 | 45 | 46 | class LayoutLMTokenizer(BertTokenizer): 47 | r""" 48 | Constructs a LayoutLM tokenizer. 49 | 50 | :class:`~transformers.LayoutLMTokenizer is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 51 | tokenization: punctuation splitting + wordpiece. 
52 | 53 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 54 | parameters. 55 | """ 56 | 57 | vocab_files_names = VOCAB_FILES_NAMES 58 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 59 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 60 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 61 | -------------------------------------------------------------------------------- /transformers/models/mobilebert/tokenization_mobilebert_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # 3 | # Copyright 2020 The HuggingFace Team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """Tokenization classes for MobileBERT.""" 17 | 18 | from ...utils import logging 19 | from ..bert.tokenization_bert_fast import BertTokenizerFast 20 | from .tokenization_mobilebert import MobileBertTokenizer 21 | 22 | 23 | logger = logging.get_logger(__name__) 24 | 25 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} 26 | 27 | PRETRAINED_VOCAB_FILES_MAP = { 28 | "vocab_file": {"mobilebert-uncased": "https://huggingface.co/google/mobilebert-uncased/resolve/main/vocab.txt"}, 29 | "tokenizer_file": { 30 | "mobilebert-uncased": "https://huggingface.co/google/mobilebert-uncased/resolve/main/tokenizer.json" 31 | }, 32 | } 33 | 34 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"mobilebert-uncased": 512} 35 | 36 | 37 | PRETRAINED_INIT_CONFIGURATION = {} 38 | 39 | 40 | class MobileBertTokenizerFast(BertTokenizerFast): 41 | r""" 42 | Construct a "fast" MobileBERT tokenizer (backed by HuggingFace's `tokenizers` library). 43 | 44 | :class:`~transformers.MobileBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs 45 | end-to-end tokenization: punctuation splitting and wordpiece. 46 | 47 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning 48 | parameters. 49 | """ 50 | 51 | vocab_files_names = VOCAB_FILES_NAMES 52 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 53 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 54 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 55 | slow_tokenizer_class = MobileBertTokenizer 56 | -------------------------------------------------------------------------------- /transformers/models/mbart/convert_mbart_original_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | 17 | import torch 18 | 19 | from transformers import BartForConditionalGeneration, MBartConfig 20 | from transformers.models.bart.convert_bart_original_pytorch_checkpoint_to_pytorch import remove_ignore_keys_ 21 | 22 | 23 | def convert_fairseq_mbart_checkpoint_from_disk(checkpoint_path, hf_config_path="facebook/mbart-large-en-ro"): 24 | state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] 25 | remove_ignore_keys_(state_dict) 26 | vocab_size = state_dict["encoder.embed_tokens.weight"].shape[0] 27 | mbart_config = MBartConfig.from_pretrained(hf_config_path, vocab_size=vocab_size) 28 | state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] 29 | model = BartForConditionalGeneration(mbart_config) 30 | model.model.load_state_dict(state_dict) 31 | return model 32 | 33 | 34 | if __name__ == "__main__": 35 | parser = argparse.ArgumentParser() 36 | # Required parameters 37 | parser.add_argument( 38 | "fairseq_path", type=str, help="bart.large, bart.large.cnn or a path to a model.pt on local filesystem." 39 | ) 40 | parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") 41 | parser.add_argument( 42 | "--hf_config", 43 | default="facebook/mbart-large-cc25", 44 | type=str, 45 | help="Which huggingface architecture to use: bart-large-xsum", 46 | ) 47 | args = parser.parse_args() 48 | model = convert_fairseq_mbart_checkpoint_from_disk(args.fairseq_path, hf_config_path=args.hf_config) 49 | model.save_pretrained(args.pytorch_dump_folder_path) 50 | -------------------------------------------------------------------------------- /transformers/models/longformer/tokenization_longformer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | from ...utils import logging 17 | from ..roberta.tokenization_roberta import RobertaTokenizer 18 | 19 | 20 | logger = logging.get_logger(__name__) 21 | 22 | 23 | # vocab and merges same as roberta 24 | vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json" 25 | merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt" 26 | _all_longformer_models = [ 27 | "allenai/longformer-base-4096", 28 | "allenai/longformer-large-4096", 29 | "allenai/longformer-large-4096-finetuned-triviaqa", 30 | "allenai/longformer-base-4096-extra.pos.embd.only", 31 | "allenai/longformer-large-4096-extra.pos.embd.only", 32 | ] 33 | 34 | 35 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 36 | "allenai/longformer-base-4096": 4096, 37 | "allenai/longformer-large-4096": 4096, 38 | "allenai/longformer-large-4096-finetuned-triviaqa": 4096, 39 | "allenai/longformer-base-4096-extra.pos.embd.only": 4096, 40 | "allenai/longformer-large-4096-extra.pos.embd.only": 4096, 41 | } 42 | 43 | 44 | class LongformerTokenizer(RobertaTokenizer): 45 | r""" 46 | Construct a Longformer tokenizer. 47 | 48 | :class:`~transformers.LongformerTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to the 49 | superclass for usage examples and documentation concerning parameters. 50 | """ 51 | # merges and vocab same as Roberta 52 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 53 | pretrained_vocab_files_map = { 54 | "vocab_file": {m: vocab_url for m in _all_longformer_models}, 55 | "merges_file": {m: merges_url for m in _all_longformer_models}, 56 | } 57 | -------------------------------------------------------------------------------- /transformers/models/t5/convert_t5_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The T5 authors and HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert T5 checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | from transformers import T5Config, T5ForConditionalGeneration, load_tf_weights_in_t5 21 | from transformers.utils import logging 22 | 23 | 24 | logging.set_verbosity_info() 25 | 26 | 27 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): 28 | # Initialise PyTorch model 29 | config = T5Config.from_json_file(config_file) 30 | print("Building PyTorch model from configuration: {}".format(str(config))) 31 | model = T5ForConditionalGeneration(config) 32 | 33 | # Load weights from tf checkpoint 34 | load_tf_weights_in_t5(model, config, tf_checkpoint_path) 35 | 36 | # Save pytorch-model 37 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 38 | model.save_pretrained(pytorch_dump_path) 39 | 40 | 41 | if __name__ == "__main__": 42 | parser = argparse.ArgumentParser() 43 | # Required parameters 44 | parser.add_argument( 45 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 46 | ) 47 | parser.add_argument( 48 | "--config_file", 49 | default=None, 50 | type=str, 51 | required=True, 52 | help="The config json file corresponding to the pre-trained T5 model. \n" 53 | "This specifies the model architecture.", 54 | ) 55 | parser.add_argument( 56 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 57 | ) 58 | args = parser.parse_args() 59 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) 60 | -------------------------------------------------------------------------------- /transformers/utils/model_parallel_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from math import ceil 17 | 18 | 19 | def assert_device_map(device_map, num_blocks): 20 | blocks = list(range(0, num_blocks)) 21 | 22 | device_map_blocks = [item for sublist in list(device_map.values()) for item in sublist] 23 | 24 | # Duplicate check 25 | duplicate_blocks = [] 26 | for i in device_map_blocks: 27 | if device_map_blocks.count(i) > 1 and i not in duplicate_blocks: 28 | duplicate_blocks.append(i) 29 | # Missing blocks 30 | missing_blocks = [i for i in blocks if i not in device_map_blocks] 31 | extra_blocks = [i for i in device_map_blocks if i not in blocks] 32 | 33 | assert len(duplicate_blocks) == 0, ( 34 | "Duplicate attention blocks specified in device_map. Attention blocks must be specified to one device. These " 35 | "attention blocks were specified more than once: " + str(duplicate_blocks) 36 | ) 37 | assert len(missing_blocks) == 0, ( 38 | "There are attention blocks for this model that are not specified in the device_map. 
Add these attention " 39 | "blocks to a device on the device_map: " + str(missing_blocks) 40 | ) 41 | assert ( 42 | len(extra_blocks) == 0 43 | ), "The device_map contains more attention blocks than this model has. Remove these from the device_map:" + str( 44 | extra_blocks 45 | ) 46 | 47 | 48 | def get_device_map(n_layers, devices): 49 | """Returns a dictionary of layers distributed evenly across all devices.""" 50 | layers = list(range(n_layers)) 51 | n_blocks = int(ceil(n_layers / len(devices))) 52 | layers_list = list(layers[i : i + n_blocks] for i in range(0, n_layers, n_blocks)) 53 | 54 | return dict(zip(devices, layers_list)) 55 | -------------------------------------------------------------------------------- /transformers/models/lxmert/convert_lxmert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert LXMERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import torch 21 | 22 | from transformers import LxmertConfig, LxmertForPreTraining, load_tf_weights_in_lxmert 23 | from transformers.utils import logging 24 | 25 | 26 | logging.set_verbosity_info() 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = LxmertConfig.from_json_file(config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = LxmertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_lxmert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import torch 21 | 22 | from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert 23 | from transformers.utils import logging 24 | 25 | 26 | logging.set_verbosity_info() 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = BertConfig.from_json_file(bert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = BertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_bert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--bert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained BERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert ALBERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import torch 21 | 22 | from transformers import AlbertConfig, AlbertForPreTraining, load_tf_weights_in_albert 23 | from transformers.utils import logging 24 | 25 | 26 | logging.set_verbosity_info() 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = AlbertConfig.from_json_file(albert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = AlbertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_albert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--albert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained ALBERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /transformers/models/mobilebert/convert_mobilebert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import argparse 16 | 17 | import torch 18 | 19 | from transformers import MobileBertConfig, MobileBertForPreTraining, load_tf_weights_in_mobilebert 20 | from transformers.utils import logging 21 | 22 | 23 | logging.set_verbosity_info() 24 | 25 | 26 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, mobilebert_config_file, pytorch_dump_path): 27 | # Initialise PyTorch model 28 | config = MobileBertConfig.from_json_file(mobilebert_config_file) 29 | print("Building PyTorch model from configuration: {}".format(str(config))) 30 | model = MobileBertForPreTraining(config) 31 | # Load weights from tf checkpoint 32 | model = load_tf_weights_in_mobilebert(model, config, tf_checkpoint_path) 33 | # Save pytorch-model 34 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 35 | torch.save(model.state_dict(), pytorch_dump_path) 36 | 37 | 38 | if __name__ == "__main__": 39 | parser = argparse.ArgumentParser() 40 | # Required parameters 41 | parser.add_argument( 42 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 43 | ) 44 | parser.add_argument( 45 | "--mobilebert_config_file", 46 | default=None, 47 | type=str, 48 | required=True, 49 | help="The config json file corresponding to the pre-trained MobileBERT model. \n" 50 | "This specifies the model architecture.", 51 | ) 52 | parser.add_argument( 53 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 54 | ) 55 | args = parser.parse_args() 56 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.mobilebert_config_file, args.pytorch_dump_path) 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /transformers/models/retribert/tokenization_retribert_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for RetriBERT.""" 16 | 17 | from ...utils import logging 18 | from ..bert.tokenization_bert_fast import BertTokenizerFast 19 | from .tokenization_retribert import RetriBertTokenizer 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} 25 | 26 | PRETRAINED_VOCAB_FILES_MAP = { 27 | "vocab_file": { 28 | "yjernite/retribert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", 29 | }, 30 | "tokenizer_file": { 31 | "yjernite/retribert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", 32 | }, 33 | } 34 | 35 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 36 | "yjernite/retribert-base-uncased": 512, 37 | } 38 | 39 | 40 | PRETRAINED_INIT_CONFIGURATION = { 41 | "yjernite/retribert-base-uncased": {"do_lower_case": True}, 42 | } 43 | 44 | 45 | class RetriBertTokenizerFast(BertTokenizerFast): 46 | r""" 47 | Construct a "fast" RetriBERT tokenizer (backed by HuggingFace's `tokenizers` library). 48 | 49 | :class:`~transformers.RetriBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs 50 | end-to-end tokenization: punctuation splitting and wordpiece. 51 | 52 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning 53 | parameters. 
54 | """ 55 | 56 | vocab_files_names = VOCAB_FILES_NAMES 57 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 58 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 59 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 60 | slow_tokenizer_class = RetriBertTokenizer 61 | model_input_names = ["attention_mask"] 62 | -------------------------------------------------------------------------------- /transformers/models/tapas/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_tapas": ["TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP", "TapasConfig"], 26 | "tokenization_tapas": ["TapasTokenizer"], 27 | } 28 | 29 | if is_torch_available(): 30 | _import_structure["modeling_tapas"] = [ 31 | "TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST", 32 | "TapasForMaskedLM", 33 | "TapasForQuestionAnswering", 34 | "TapasForSequenceClassification", 35 | "TapasModel", 36 | ] 37 | 38 | 39 | if TYPE_CHECKING: 40 | from .configuration_tapas import TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP, TapasConfig 41 | from .tokenization_tapas import TapasTokenizer 42 | 43 | if is_torch_available(): 44 | from .modeling_tapas import ( 45 | TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST, 46 | TapasForMaskedLM, 47 | TapasForQuestionAnswering, 48 | TapasForSequenceClassification, 49 | TapasModel, 50 | ) 51 | 52 | else: 53 | import importlib 54 | import os 55 | import sys 56 | 57 | class _LazyModule(_BaseLazyModule): 58 | """ 59 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 60 | """ 61 | 62 | __file__ = globals()["__file__"] 63 | __path__ = [os.path.dirname(__file__)] 64 | 65 | def _get_module(self, module_name: str): 66 | return importlib.import_module("." + module_name, self.__name__) 67 | 68 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 69 | -------------------------------------------------------------------------------- /transformers/models/bert_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_sentencepiece_available, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_bert_generation": ["BertGenerationConfig"], 26 | } 27 | 28 | if is_sentencepiece_available(): 29 | _import_structure["tokenization_bert_generation"] = ["BertGenerationTokenizer"] 30 | 31 | if is_torch_available(): 32 | _import_structure["modeling_bert_generation"] = [ 33 | "BertGenerationDecoder", 34 | "BertGenerationEncoder", 35 | "load_tf_weights_in_bert_generation", 36 | ] 37 | 38 | 39 | if TYPE_CHECKING: 40 | from .configuration_bert_generation import BertGenerationConfig 41 | 42 | if is_sentencepiece_available(): 43 | from .tokenization_bert_generation import BertGenerationTokenizer 44 | 45 | if is_torch_available(): 46 | from .modeling_bert_generation import ( 47 | BertGenerationDecoder, 48 | BertGenerationEncoder, 49 | load_tf_weights_in_bert_generation, 50 | ) 51 | 52 | else: 53 | import importlib 54 | import os 55 | import sys 56 | 57 | class _LazyModule(_BaseLazyModule): 58 | """ 59 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 60 | """ 61 | 62 | __file__ = globals()["__file__"] 63 | __path__ = [os.path.dirname(__file__)] 64 | 65 | def _get_module(self, module_name: str): 66 | return importlib.import_module("." + module_name, self.__name__) 67 | 68 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 69 | -------------------------------------------------------------------------------- /transformers/models/squeezebert/tokenization_squeezebert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The SqueezeBert authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for SqueezeBERT.""" 16 | 17 | from ...utils import logging 18 | from ..bert.tokenization_bert import BertTokenizer 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 24 | 25 | PRETRAINED_VOCAB_FILES_MAP = { 26 | "vocab_file": { 27 | "squeezebert/squeezebert-uncased": "https://huggingface.co/squeezebert/squeezebert-uncased/resolve/main/vocab.txt", 28 | "squeezebert/squeezebert-mnli": "https://huggingface.co/squeezebert/squeezebert-mnli/resolve/main/vocab.txt", 29 | "squeezebert/squeezebert-mnli-headless": "https://huggingface.co/squeezebert/squeezebert-mnli-headless/resolve/main/vocab.txt", 30 | } 31 | } 32 | 33 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 34 | "squeezebert/squeezebert-uncased": 512, 35 | "squeezebert/squeezebert-mnli": 512, 36 | "squeezebert/squeezebert-mnli-headless": 512, 37 | } 38 | 39 | 40 | PRETRAINED_INIT_CONFIGURATION = { 41 | "squeezebert/squeezebert-uncased": {"do_lower_case": True}, 42 | "squeezebert/squeezebert-mnli": {"do_lower_case": True}, 43 | "squeezebert/squeezebert-mnli-headless": {"do_lower_case": True}, 44 | } 45 | 46 | 47 | class SqueezeBertTokenizer(BertTokenizer): 48 | r""" 49 | Constructs a SqueezeBert tokenizer. 50 | 51 | :class:`~transformers.SqueezeBertTokenizer is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 52 | tokenization: punctuation splitting + wordpiece. 53 | 54 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 55 | parameters. 56 | """ 57 | 58 | vocab_files_names = VOCAB_FILES_NAMES 59 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 60 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 61 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 62 | -------------------------------------------------------------------------------- /transformers/models/funnel/convert_funnel_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert Funnel checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import torch 21 | 22 | from transformers import FunnelBaseModel, FunnelConfig, FunnelModel, load_tf_weights_in_funnel 23 | from transformers.utils import logging 24 | 25 | 26 | logging.set_verbosity_info() 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, base_model): 30 | # Initialise PyTorch model 31 | config = FunnelConfig.from_json_file(config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = FunnelBaseModel(config) if base_model else FunnelModel(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_funnel(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | parser.add_argument( 61 | "--base_model", action="store_true", help="Whether you want just the base model (no decoder) or not." 62 | ) 63 | args = parser.parse_args() 64 | convert_tf_checkpoint_to_pytorch( 65 | args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.base_model 66 | ) 67 | -------------------------------------------------------------------------------- /transformers/models/retribert/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_tokenizers_available, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_retribert": ["RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RetriBertConfig"], 26 | "tokenization_retribert": ["RetriBertTokenizer"], 27 | } 28 | 29 | if is_tokenizers_available(): 30 | _import_structure["tokenization_retribert_fast"] = ["RetriBertTokenizerFast"] 31 | 32 | if is_torch_available(): 33 | _import_structure["modeling_retribert"] = [ 34 | "RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST", 35 | "RetriBertModel", 36 | "RetriBertPreTrainedModel", 37 | ] 38 | 39 | 40 | if TYPE_CHECKING: 41 | from .configuration_retribert import RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RetriBertConfig 42 | from .tokenization_retribert import RetriBertTokenizer 43 | 44 | if is_tokenizers_available(): 45 | from .tokenization_retribert_fast import RetriBertTokenizerFast 46 | 47 | if is_torch_available(): 48 | from .modeling_retribert import ( 49 | RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST, 50 | RetriBertModel, 51 | RetriBertPreTrainedModel, 52 | ) 53 | 54 | else: 55 | import importlib 56 | import os 57 | import sys 58 | 59 | class _LazyModule(_BaseLazyModule): 60 | """ 61 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 62 | """ 63 | 64 | __file__ = globals()["__file__"] 65 | __path__ = [os.path.dirname(__file__)] 66 | 67 | def _get_module(self, module_name: str): 68 | return importlib.import_module("." + module_name, self.__name__) 69 | 70 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 71 | -------------------------------------------------------------------------------- /transformers/models/deberta/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_deberta": ["DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaConfig"], 26 | "tokenization_deberta": ["DebertaTokenizer"], 27 | } 28 | 29 | if is_torch_available(): 30 | _import_structure["modeling_deberta"] = [ 31 | "DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST", 32 | "DebertaForSequenceClassification", 33 | "DebertaModel", 34 | "DebertaForMaskedLM", 35 | "DebertaPreTrainedModel", 36 | "DebertaForTokenClassification", 37 | "DebertaForQuestionAnswering", 38 | ] 39 | 40 | 41 | if TYPE_CHECKING: 42 | from .configuration_deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig 43 | from .tokenization_deberta import DebertaTokenizer 44 | 45 | if is_torch_available(): 46 | from .modeling_deberta import ( 47 | DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, 48 | DebertaForMaskedLM, 49 | DebertaForQuestionAnswering, 50 | DebertaForSequenceClassification, 51 | DebertaForTokenClassification, 52 | DebertaModel, 53 | DebertaPreTrainedModel, 54 | ) 55 | 56 | else: 57 | import importlib 58 | import os 59 | import sys 60 | 61 | class _LazyModule(_BaseLazyModule): 62 | """ 63 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 64 | """ 65 | 66 | __file__ = globals()["__file__"] 67 | __path__ = [os.path.dirname(__file__)] 68 | 69 | def _get_module(self, module_name: str): 70 | return importlib.import_module("." + module_name, self.__name__) 71 | 72 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 73 | -------------------------------------------------------------------------------- /transformers/models/longformer/tokenization_longformer_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | from ...utils import logging 17 | from ..roberta.tokenization_roberta_fast import RobertaTokenizerFast 18 | from .tokenization_longformer import LongformerTokenizer 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | 24 | # vocab and merges same as roberta 25 | vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json" 26 | merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt" 27 | tokenizer_url = "https://huggingface.co/roberta-large/resolve/main/tokenizer.json" 28 | _all_longformer_models = [ 29 | "allenai/longformer-base-4096", 30 | "allenai/longformer-large-4096", 31 | "allenai/longformer-large-4096-finetuned-triviaqa", 32 | "allenai/longformer-base-4096-extra.pos.embd.only", 33 | "allenai/longformer-large-4096-extra.pos.embd.only", 34 | ] 35 | 36 | 37 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 38 | "allenai/longformer-base-4096": 4096, 39 | "allenai/longformer-large-4096": 4096, 40 | "allenai/longformer-large-4096-finetuned-triviaqa": 4096, 41 | "allenai/longformer-base-4096-extra.pos.embd.only": 4096, 42 | "allenai/longformer-large-4096-extra.pos.embd.only": 4096, 43 | } 44 | 45 | 46 | class LongformerTokenizerFast(RobertaTokenizerFast): 47 | r""" 48 | Construct a "fast" Longformer tokenizer (backed by HuggingFace's `tokenizers` library). 49 | 50 | :class:`~transformers.LongformerTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer 51 | to the superclass for usage examples and documentation concerning parameters. 52 | """ 53 | # merges and vocab same as Roberta 54 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 55 | pretrained_vocab_files_map = { 56 | "vocab_file": {m: vocab_url for m in _all_longformer_models}, 57 | "merges_file": {m: merges_url for m in _all_longformer_models}, 58 | "tokenizer_file": {m: tokenizer_url for m in _all_longformer_models}, 59 | } 60 | slow_tokenizer_class = LongformerTokenizer 61 | -------------------------------------------------------------------------------- /transformers/models/prophetnet/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_prophetnet": ["PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "ProphetNetConfig"], 26 | "tokenization_prophetnet": ["ProphetNetTokenizer"], 27 | } 28 | 29 | if is_torch_available(): 30 | _import_structure["modeling_prophetnet"] = [ 31 | "PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST", 32 | "ProphetNetDecoder", 33 | "ProphetNetEncoder", 34 | "ProphetNetForCausalLM", 35 | "ProphetNetForConditionalGeneration", 36 | "ProphetNetModel", 37 | "ProphetNetPreTrainedModel", 38 | ] 39 | 40 | 41 | if TYPE_CHECKING: 42 | from .configuration_prophetnet import PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, ProphetNetConfig 43 | from .tokenization_prophetnet import ProphetNetTokenizer 44 | 45 | if is_torch_available(): 46 | from .modeling_prophetnet import ( 47 | PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST, 48 | ProphetNetDecoder, 49 | ProphetNetEncoder, 50 | ProphetNetForCausalLM, 51 | ProphetNetForConditionalGeneration, 52 | ProphetNetModel, 53 | ProphetNetPreTrainedModel, 54 | ) 55 | 56 | else: 57 | import importlib 58 | import os 59 | import sys 60 | 61 | class _LazyModule(_BaseLazyModule): 62 | """ 63 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 64 | """ 65 | 66 | __file__ = globals()["__file__"] 67 | __path__ = [os.path.dirname(__file__)] 68 | 69 | def _get_module(self, module_name: str): 70 | return importlib.import_module("." + module_name, self.__name__) 71 | 72 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 73 | -------------------------------------------------------------------------------- /transformers/models/layoutlm/tokenization_layoutlm_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """ Tokenization class for model LayoutLM.""" 16 | 17 | 18 | from ...utils import logging 19 | from ..bert.tokenization_bert_fast import BertTokenizerFast 20 | from .tokenization_layoutlm import LayoutLMTokenizer 21 | 22 | 23 | logger = logging.get_logger(__name__) 24 | 25 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} 26 | 27 | PRETRAINED_VOCAB_FILES_MAP = { 28 | "vocab_file": { 29 | "microsoft/layoutlm-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", 30 | "microsoft/layoutlm-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt", 31 | }, 32 | "tokenizer_file": { 33 | "microsoft/layoutlm-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", 34 | "microsoft/layoutlm-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/tokenizer.json", 35 | }, 36 | } 37 | 38 | 39 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 40 | "microsoft/layoutlm-base-uncased": 512, 41 | "microsoft/layoutlm-large-uncased": 512, 42 | } 43 | 44 | 45 | PRETRAINED_INIT_CONFIGURATION = { 46 | "microsoft/layoutlm-base-uncased": {"do_lower_case": True}, 47 | "microsoft/layoutlm-large-uncased": {"do_lower_case": True}, 48 | } 49 | 50 | 51 | class LayoutLMTokenizerFast(BertTokenizerFast): 52 | r""" 53 | Constructs a "Fast" LayoutLMTokenizer. 54 | 55 | :class:`~transformers.LayoutLMTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs 56 | end-to-end tokenization: punctuation splitting + wordpiece. 57 | 58 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning 59 | parameters. 60 | """ 61 | 62 | vocab_files_names = VOCAB_FILES_NAMES 63 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 64 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 65 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 66 | slow_tokenizer_class = LayoutLMTokenizer 67 | -------------------------------------------------------------------------------- /transformers/models/blenderbot/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_tf_available, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_blenderbot": ["BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BlenderbotConfig"], 26 | "tokenization_blenderbot": ["BlenderbotTokenizer"], 27 | } 28 | 29 | if is_torch_available(): 30 | _import_structure["modeling_blenderbot"] = [ 31 | "BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST", 32 | "BlenderbotForConditionalGeneration", 33 | "BlenderbotModel", 34 | "BlenderbotPreTrainedModel", 35 | ] 36 | 37 | 38 | if is_tf_available(): 39 | _import_structure["modeling_tf_blenderbot"] = ["TFBlenderbotForConditionalGeneration", "TFBlenderbotModel"] 40 | 41 | 42 | if TYPE_CHECKING: 43 | from .configuration_blenderbot import BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotConfig 44 | from .tokenization_blenderbot import BlenderbotTokenizer 45 | 46 | if is_torch_available(): 47 | from .modeling_blenderbot import ( 48 | BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST, 49 | BlenderbotForConditionalGeneration, 50 | BlenderbotModel, 51 | BlenderbotPreTrainedModel, 52 | ) 53 | 54 | if is_tf_available(): 55 | from .modeling_tf_blenderbot import TFBlenderbotForConditionalGeneration, TFBlenderbotModel 56 | 57 | else: 58 | import importlib 59 | import os 60 | import sys 61 | 62 | class _LazyModule(_BaseLazyModule): 63 | """ 64 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 65 | """ 66 | 67 | __file__ = globals()["__file__"] 68 | __path__ = [os.path.dirname(__file__)] 69 | 70 | def _get_module(self, module_name: str): 71 | return importlib.import_module("." + module_name, self.__name__) 72 | 73 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 74 | -------------------------------------------------------------------------------- /transformers/models/layoutlm/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_tokenizers_available, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_layoutlm": ["LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "LayoutLMConfig"], 26 | "tokenization_layoutlm": ["LayoutLMTokenizer"], 27 | } 28 | 29 | if is_tokenizers_available(): 30 | _import_structure["tokenization_layoutlm_fast"] = ["LayoutLMTokenizerFast"] 31 | 32 | if is_torch_available(): 33 | _import_structure["modeling_layoutlm"] = [ 34 | "LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST", 35 | "LayoutLMForMaskedLM", 36 | "LayoutLMForSequenceClassification", 37 | "LayoutLMForTokenClassification", 38 | "LayoutLMModel", 39 | ] 40 | 41 | 42 | if TYPE_CHECKING: 43 | from .configuration_layoutlm import LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP, LayoutLMConfig 44 | from .tokenization_layoutlm import LayoutLMTokenizer 45 | 46 | if is_tokenizers_available(): 47 | from .tokenization_layoutlm_fast import LayoutLMTokenizerFast 48 | 49 | if is_torch_available(): 50 | from .modeling_layoutlm import ( 51 | LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST, 52 | LayoutLMForMaskedLM, 53 | LayoutLMForSequenceClassification, 54 | LayoutLMForTokenClassification, 55 | LayoutLMModel, 56 | ) 57 | 58 | else: 59 | import importlib 60 | import os 61 | import sys 62 | 63 | class _LazyModule(_BaseLazyModule): 64 | """ 65 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 66 | """ 67 | 68 | __file__ = globals()["__file__"] 69 | __path__ = [os.path.dirname(__file__)] 70 | 71 | def _get_module(self, module_name: str): 72 | return importlib.import_module("." + module_name, self.__name__) 73 | 74 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 75 | -------------------------------------------------------------------------------- /transformers/models/lxmert/tokenization_lxmert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from ..bert.tokenization_bert import BertTokenizer 17 | 18 | 19 | #################################################### 20 | # Mapping from the keyword arguments names of Tokenizer `__init__` 21 | # to file names for serializing Tokenizer instances 22 | #################################################### 23 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 24 | 25 | #################################################### 26 | # Mapping from the keyword arguments names of Tokenizer `__init__` 27 | # to pretrained vocabulary URL for all the model ids. 
28 | #################################################### 29 | PRETRAINED_VOCAB_FILES_MAP = { 30 | "vocab_file": { 31 | "unc-nlp/lxmert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", 32 | } 33 | } 34 | 35 | #################################################### 36 | # Mapping from model ids to max length of inputs 37 | #################################################### 38 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 39 | "unc-nlp/lxmert-base-uncased": 512, 40 | } 41 | #################################################### 42 | # Mapping from model ids to a dictionary of additional 43 | # keyword arguments for Tokenizer `__init__`. 44 | # To be used for checkpoint specific configurations. 45 | #################################################### 46 | PRETRAINED_INIT_CONFIGURATION = { 47 | "unc-nlp/lxmert-base-uncased": {"do_lower_case": True}, 48 | } 49 | 50 | 51 | class LxmertTokenizer(BertTokenizer): 52 | r""" 53 | Construct an LXMERT tokenizer. 54 | 55 | :class:`~transformers.LxmertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 56 | tokenization: punctuation splitting and wordpiece. 57 | 58 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 59 | parameters. 60 | """ 61 | 62 | vocab_files_names = VOCAB_FILES_NAMES 63 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 64 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 65 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 66 | -------------------------------------------------------------------------------- /transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import torch 21 | 22 | from transformers import GPT2Config, GPT2Model, load_tf_weights_in_gpt2 23 | from transformers.file_utils import CONFIG_NAME, WEIGHTS_NAME 24 | from transformers.utils import logging 25 | 26 | 27 | logging.set_verbosity_info() 28 | 29 | 30 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 31 | # Construct model 32 | if gpt2_config_file == "": 33 | config = GPT2Config() 34 | else: 35 | config = GPT2Config.from_json_file(gpt2_config_file) 36 | model = GPT2Model(config) 37 | 38 | # Load weights from numpy 39 | load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) 40 | 41 | # Save pytorch-model 42 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 43 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 44 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 45 | torch.save(model.state_dict(), pytorch_weights_dump_path) 46 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 47 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 48 | f.write(config.to_json_string()) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | # Required parameters 54 | parser.add_argument( 55 | "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | parser.add_argument( 61 | "--gpt2_config_file", 62 | default="", 63 | type=str, 64 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n" 65 | "This specifies the model architecture.", 66 | ) 67 | args = parser.parse_args() 68 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path) 69 | -------------------------------------------------------------------------------- /transformers/models/marian/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | from typing import TYPE_CHECKING 19 | 20 | from ...file_utils import ( 21 | _BaseLazyModule, 22 | is_sentencepiece_available, 23 | is_tf_available, 24 | is_tokenizers_available, 25 | is_torch_available, 26 | ) 27 | 28 | 29 | _import_structure = { 30 | "configuration_marian": ["MARIAN_PRETRAINED_CONFIG_ARCHIVE_MAP", "MarianConfig"], 31 | } 32 | 33 | if is_sentencepiece_available(): 34 | _import_structure["tokenization_marian"] = ["MarianTokenizer"] 35 | 36 | if is_torch_available(): 37 | _import_structure["modeling_marian"] = [ 38 | "MARIAN_PRETRAINED_MODEL_ARCHIVE_LIST", 39 | "MarianModel", 40 | "MarianMTModel", 41 | "MarianPreTrainedModel", 42 | ] 43 | 44 | if is_tf_available(): 45 | _import_structure["modeling_tf_marian"] = ["TFMarianMTModel", "TFMarianModel"] 46 | 47 | 48 | if TYPE_CHECKING: 49 | from .configuration_marian import MARIAN_PRETRAINED_CONFIG_ARCHIVE_MAP, MarianConfig 50 | 51 | if is_sentencepiece_available(): 52 | from .tokenization_marian import MarianTokenizer 53 | 54 | if is_torch_available(): 55 | from .modeling_marian import ( 56 | MARIAN_PRETRAINED_MODEL_ARCHIVE_LIST, 57 | MarianModel, 58 | MarianMTModel, 59 | MarianPreTrainedModel, 60 | ) 61 | 62 | if is_tf_available(): 63 | from .modeling_tf_marian import TFMarianModel, TFMarianMTModel 64 | 65 | else: 66 | import importlib 67 | import os 68 | import sys 69 | 70 | class _LazyModule(_BaseLazyModule): 71 | """ 72 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 73 | """ 74 | 75 | __file__ = globals()["__file__"] 76 | __path__ = [os.path.dirname(__file__)] 77 | 78 | def _get_module(self, module_name: str): 79 | return importlib.import_module("." + module_name, self.__name__) 80 | 81 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 82 | -------------------------------------------------------------------------------- /transformers/commands/env.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import platform 16 | from argparse import ArgumentParser 17 | 18 | from .. import __version__ as version 19 | from ..file_utils import is_tf_available, is_torch_available 20 | from . 
import BaseTransformersCLICommand 21 | 22 | 23 | def info_command_factory(_): 24 | return EnvironmentCommand() 25 | 26 | 27 | class EnvironmentCommand(BaseTransformersCLICommand): 28 | @staticmethod 29 | def register_subcommand(parser: ArgumentParser): 30 | download_parser = parser.add_parser("env") 31 | download_parser.set_defaults(func=info_command_factory) 32 | 33 | def run(self): 34 | pt_version = "not installed" 35 | pt_cuda_available = "NA" 36 | if is_torch_available(): 37 | import torch 38 | 39 | pt_version = torch.__version__ 40 | pt_cuda_available = torch.cuda.is_available() 41 | 42 | tf_version = "not installed" 43 | tf_cuda_available = "NA" 44 | if is_tf_available(): 45 | import tensorflow as tf 46 | 47 | tf_version = tf.__version__ 48 | try: 49 | # deprecated in v2.1 50 | tf_cuda_available = tf.test.is_gpu_available() 51 | except AttributeError: 52 | # returns list of devices, convert to bool 53 | tf_cuda_available = bool(tf.config.list_physical_devices("GPU")) 54 | 55 | info = { 56 | "`transformers` version": version, 57 | "Platform": platform.platform(), 58 | "Python version": platform.python_version(), 59 | "PyTorch version (GPU?)": "{} ({})".format(pt_version, pt_cuda_available), 60 | "Tensorflow version (GPU?)": "{} ({})".format(tf_version, tf_cuda_available), 61 | "Using GPU in script?": "", 62 | "Using distributed or parallel set-up in script?": "", 63 | } 64 | 65 | print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n") 66 | print(self.format_dict(info)) 67 | 68 | return info 69 | 70 | @staticmethod 71 | def format_dict(d): 72 | return "\n".join(["- {}: {}".format(prop, val) for prop, val in d.items()]) + "\n" 73 | -------------------------------------------------------------------------------- /transformers/models/blenderbot_small/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
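EnvironmentCommand is registered as the `env` subcommand of transformers-cli, but because run() just collects and returns a plain dict, it can also be exercised directly from Python; a small sketch:

from transformers.commands.env import EnvironmentCommand

info = EnvironmentCommand().run()  # prints the report and returns the underlying dict
print(info["Platform"])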
18 | from typing import TYPE_CHECKING 19 | 20 | from ...file_utils import _BaseLazyModule, is_tf_available, is_torch_available 21 | 22 | 23 | _import_structure = { 24 | "configuration_blenderbot_small": ["BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP", "BlenderbotSmallConfig"], 25 | "tokenization_blenderbot_small": ["BlenderbotSmallTokenizer"], 26 | } 27 | 28 | if is_torch_available(): 29 | _import_structure["modeling_blenderbot_small"] = [ 30 | "BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST", 31 | "BlenderbotSmallForConditionalGeneration", 32 | "BlenderbotSmallModel", 33 | "BlenderbotSmallPreTrainedModel", 34 | ] 35 | 36 | if is_tf_available(): 37 | _import_structure["modeling_tf_blenderbot_small"] = [ 38 | "TFBlenderbotSmallForConditionalGeneration", 39 | "TFBlenderbotSmallModel", 40 | ] 41 | 42 | if TYPE_CHECKING: 43 | from .configuration_blenderbot_small import BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotSmallConfig 44 | from .tokenization_blenderbot_small import BlenderbotSmallTokenizer 45 | 46 | if is_torch_available(): 47 | from .modeling_blenderbot_small import ( 48 | BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST, 49 | BlenderbotSmallForConditionalGeneration, 50 | BlenderbotSmallModel, 51 | BlenderbotSmallPreTrainedModel, 52 | ) 53 | 54 | if is_tf_available(): 55 | from .modeling_tf_blenderbot_small import TFBlenderbotSmallForConditionalGeneration, TFBlenderbotSmallModel 56 | 57 | else: 58 | import importlib 59 | import os 60 | import sys 61 | 62 | class _LazyModule(_BaseLazyModule): 63 | """ 64 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 65 | """ 66 | 67 | __file__ = globals()["__file__"] 68 | __path__ = [os.path.dirname(__file__)] 69 | 70 | def _get_module(self, module_name: str): 71 | return importlib.import_module("." + module_name, self.__name__) 72 | 73 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 74 | -------------------------------------------------------------------------------- /transformers/models/openai/convert_openai_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import torch 21 | 22 | from transformers import OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt 23 | from transformers.file_utils import CONFIG_NAME, WEIGHTS_NAME 24 | from transformers.utils import logging 25 | 26 | 27 | logging.set_verbosity_info() 28 | 29 | 30 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 31 | # Construct model 32 | if openai_config_file == "": 33 | config = OpenAIGPTConfig() 34 | else: 35 | config = OpenAIGPTConfig.from_json_file(openai_config_file) 36 | model = OpenAIGPTModel(config) 37 | 38 | # Load weights from numpy 39 | load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path) 40 | 41 | # Save pytorch-model 42 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 43 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 44 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 45 | torch.save(model.state_dict(), pytorch_weights_dump_path) 46 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 47 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 48 | f.write(config.to_json_string()) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | # Required parameters 54 | parser.add_argument( 55 | "--openai_checkpoint_folder_path", 56 | default=None, 57 | type=str, 58 | required=True, 59 | help="Path to the TensorFlow checkpoint path.", 60 | ) 61 | parser.add_argument( 62 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 63 | ) 64 | parser.add_argument( 65 | "--openai_config_file", 66 | default="", 67 | type=str, 68 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n" 69 | "This specifies the model architecture.", 70 | ) 71 | args = parser.parse_args() 72 | convert_openai_checkpoint_to_pytorch( 73 | args.openai_checkpoint_folder_path, args.openai_config_file, args.pytorch_dump_folder_path 74 | ) 75 | -------------------------------------------------------------------------------- /transformers/models/ctrl/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_tf_available, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_ctrl": ["CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CTRLConfig"], 26 | "tokenization_ctrl": ["CTRLTokenizer"], 27 | } 28 | 29 | if is_torch_available(): 30 | _import_structure["modeling_ctrl"] = [ 31 | "CTRL_PRETRAINED_MODEL_ARCHIVE_LIST", 32 | "CTRLForSequenceClassification", 33 | "CTRLLMHeadModel", 34 | "CTRLModel", 35 | "CTRLPreTrainedModel", 36 | ] 37 | 38 | if is_tf_available(): 39 | _import_structure["modeling_tf_ctrl"] = [ 40 | "TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST", 41 | "TFCTRLForSequenceClassification", 42 | "TFCTRLLMHeadModel", 43 | "TFCTRLModel", 44 | "TFCTRLPreTrainedModel", 45 | ] 46 | 47 | 48 | if TYPE_CHECKING: 49 | from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig 50 | from .tokenization_ctrl import CTRLTokenizer 51 | 52 | if is_torch_available(): 53 | from .modeling_ctrl import ( 54 | CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, 55 | CTRLForSequenceClassification, 56 | CTRLLMHeadModel, 57 | CTRLModel, 58 | CTRLPreTrainedModel, 59 | ) 60 | 61 | if is_tf_available(): 62 | from .modeling_tf_ctrl import ( 63 | TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, 64 | TFCTRLForSequenceClassification, 65 | TFCTRLLMHeadModel, 66 | TFCTRLModel, 67 | TFCTRLPreTrainedModel, 68 | ) 69 | 70 | else: 71 | import importlib 72 | import os 73 | import sys 74 | 75 | class _LazyModule(_BaseLazyModule): 76 | """ 77 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 78 | """ 79 | 80 | __file__ = globals()["__file__"] 81 | __path__ = [os.path.dirname(__file__)] 82 | 83 | def _get_module(self, module_name: str): 84 | return importlib.import_module("." + module_name, self.__name__) 85 | 86 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 87 | -------------------------------------------------------------------------------- /transformers/models/led/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2021 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | from typing import TYPE_CHECKING 19 | 20 | from ...file_utils import _BaseLazyModule, is_tf_available, is_tokenizers_available, is_torch_available 21 | 22 | 23 | _import_structure = { 24 | "configuration_led": ["LED_PRETRAINED_CONFIG_ARCHIVE_MAP", "LEDConfig"], 25 | "tokenization_led": ["LEDTokenizer"], 26 | } 27 | 28 | if is_tokenizers_available(): 29 | _import_structure["tokenization_led_fast"] = ["LEDTokenizerFast"] 30 | 31 | if is_torch_available(): 32 | _import_structure["modeling_led"] = [ 33 | "LED_PRETRAINED_MODEL_ARCHIVE_LIST", 34 | "LEDForConditionalGeneration", 35 | "LEDForQuestionAnswering", 36 | "LEDForSequenceClassification", 37 | "LEDModel", 38 | "LEDPreTrainedModel", 39 | ] 40 | 41 | 42 | if is_tf_available(): 43 | _import_structure["modeling_tf_led"] = ["TFLEDForConditionalGeneration", "TFLEDModel", "TFLEDPreTrainedModel"] 44 | 45 | 46 | if TYPE_CHECKING: 47 | from .configuration_led import LED_PRETRAINED_CONFIG_ARCHIVE_MAP, LEDConfig 48 | from .tokenization_led import LEDTokenizer 49 | 50 | if is_tokenizers_available(): 51 | from .tokenization_led_fast import LEDTokenizerFast 52 | 53 | if is_torch_available(): 54 | from .modeling_led import ( 55 | LED_PRETRAINED_MODEL_ARCHIVE_LIST, 56 | LEDForConditionalGeneration, 57 | LEDForQuestionAnswering, 58 | LEDForSequenceClassification, 59 | LEDModel, 60 | LEDPreTrainedModel, 61 | ) 62 | 63 | if is_tf_available(): 64 | from .modeling_tf_led import TFLEDForConditionalGeneration, TFLEDModel, TFLEDPreTrainedModel 65 | 66 | else: 67 | import importlib 68 | import os 69 | import sys 70 | 71 | class _LazyModule(_BaseLazyModule): 72 | """ 73 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 74 | """ 75 | 76 | __file__ = globals()["__file__"] 77 | __path__ = [os.path.dirname(__file__)] 78 | 79 | def _get_module(self, module_name: str): 80 | return importlib.import_module("." + module_name, self.__name__) 81 | 82 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 83 | -------------------------------------------------------------------------------- /transformers/models/roberta/configuration_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ RoBERTa configuration """ 17 | 18 | from ...utils import logging 19 | from ..bert.configuration_bert import BertConfig 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 25 | "roberta-base": "https://huggingface.co/roberta-base/resolve/main/config.json", 26 | "roberta-large": "https://huggingface.co/roberta-large/resolve/main/config.json", 27 | "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/config.json", 28 | "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/config.json", 29 | "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/config.json", 30 | "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/config.json", 31 | } 32 | 33 | 34 | class RobertaConfig(BertConfig): 35 | r""" 36 | This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel` or a 37 | :class:`~transformers.TFRobertaModel`. It is used to instantiate a RoBERTa model according to the specified 38 | arguments, defining the model architecture. 39 | 40 | 41 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model 42 | outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. 43 | 44 | The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`. It reuses the 45 | same defaults. Please check the parent class for more information. 46 | 47 | Examples:: 48 | 49 | >>> from transformers import RobertaConfig, RobertaModel 50 | 51 | >>> # Initializing a RoBERTa configuration 52 | >>> configuration = RobertaConfig() 53 | 54 | >>> # Initializing a model from the configuration 55 | >>> model = RobertaModel(configuration) 56 | 57 | >>> # Accessing the model configuration 58 | >>> configuration = model.config 59 | """ 60 | model_type = "roberta" 61 | 62 | def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs): 63 | """Constructs RobertaConfig.""" 64 | super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) 65 | -------------------------------------------------------------------------------- /transformers/activations_tf.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import math 16 | 17 | import tensorflow as tf 18 | from packaging import version 19 | 20 | 21 | def _gelu(x): 22 | """ 23 | Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when 24 | initially created. 
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 25 | 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) Also see 26 | https://arxiv.org/abs/1606.08415 27 | """ 28 | x = tf.convert_to_tensor(x) 29 | cdf = 0.5 * (1.0 + tf.math.erf(x / tf.cast(tf.sqrt(2.0), x.dtype))) 30 | 31 | return x * cdf 32 | 33 | 34 | def _gelu_new(x): 35 | """ 36 | Gaussian Error Linear Unit. This is a smoother version of the RELU. Original paper: https://arxiv.org/abs/1606.08415 37 | 38 | Args: 39 | x: float Tensor to perform activation 40 | 41 | Returns: 42 | `x` with the GELU activation applied. 43 | """ 44 | x = tf.convert_to_tensor(x) 45 | pi = tf.cast(math.pi, x.dtype) 46 | coeff = tf.cast(0.044715, x.dtype) 47 | cdf = 0.5 * (1.0 + tf.tanh(tf.sqrt(2.0 / pi) * (x + coeff * tf.pow(x, 3)))) 48 | 49 | return x * cdf 50 | 51 | 52 | def mish(x): 53 | x = tf.convert_to_tensor(x) 54 | 55 | return x * tf.tanh(tf.math.softplus(x)) 56 | 57 | 58 | def gelu_fast(x): 59 | x = tf.convert_to_tensor(x) 60 | coeff1 = tf.cast(0.7978845608, x.dtype) 61 | coeff2 = tf.cast(0.044715, x.dtype) 62 | 63 | return 0.5 * x * (1.0 + tf.tanh(x * coeff1 * (1.0 + coeff2 * x * x))) 64 | 65 | 66 | if version.parse(tf.version.VERSION) >= version.parse("2.4"): 67 | 68 | def approximate_gelu_wrap(x): 69 | return tf.keras.activations.gelu(x, approximate=True) 70 | 71 | gelu = tf.keras.activations.gelu 72 | gelu_new = approximate_gelu_wrap 73 | else: 74 | gelu = _gelu 75 | gelu_new = _gelu_new 76 | 77 | 78 | ACT2FN = { 79 | "gelu": gelu, 80 | "relu": tf.keras.activations.relu, 81 | "swish": tf.keras.activations.swish, 82 | "silu": tf.keras.activations.swish, 83 | "gelu_new": gelu_new, 84 | "mish": mish, 85 | "tanh": tf.keras.activations.tanh, 86 | "gelu_fast": gelu_fast, 87 | } 88 | 89 | 90 | def get_tf_activation(activation_string): 91 | if activation_string in ACT2FN: 92 | return ACT2FN[activation_string] 93 | else: 94 | raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys()))) 95 | -------------------------------------------------------------------------------- /transformers/models/reformer/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License.
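As a quick numerical check, the exact gelu and the tanh-based gelu_new approximation above agree closely (differences on the order of 1e-4 for inputs of this scale); a sketch using the get_tf_activation lookup:

import tensorflow as tf

from transformers.activations_tf import get_tf_activation

x = tf.constant([-1.0, 0.0, 1.0])
exact = get_tf_activation("gelu")(x)
approx = get_tf_activation("gelu_new")(x)
print(tf.reduce_max(tf.abs(exact - approx)).numpy())  # small, roughly 1e-4 here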
18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_sentencepiece_available, is_tokenizers_available, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_reformer": ["REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "ReformerConfig"], 26 | } 27 | 28 | if is_sentencepiece_available(): 29 | _import_structure["tokenization_reformer"] = ["ReformerTokenizer"] 30 | 31 | if is_tokenizers_available(): 32 | _import_structure["tokenization_reformer_fast"] = ["ReformerTokenizerFast"] 33 | 34 | if is_torch_available(): 35 | _import_structure["modeling_reformer"] = [ 36 | "REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", 37 | "ReformerAttention", 38 | "ReformerForMaskedLM", 39 | "ReformerForQuestionAnswering", 40 | "ReformerForSequenceClassification", 41 | "ReformerLayer", 42 | "ReformerModel", 43 | "ReformerModelWithLMHead", 44 | ] 45 | 46 | 47 | if TYPE_CHECKING: 48 | from .configuration_reformer import REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, ReformerConfig 49 | 50 | if is_sentencepiece_available(): 51 | from .tokenization_reformer import ReformerTokenizer 52 | 53 | if is_tokenizers_available(): 54 | from .tokenization_reformer_fast import ReformerTokenizerFast 55 | 56 | if is_torch_available(): 57 | from .modeling_reformer import ( 58 | REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, 59 | ReformerAttention, 60 | ReformerForMaskedLM, 61 | ReformerForQuestionAnswering, 62 | ReformerForSequenceClassification, 63 | ReformerLayer, 64 | ReformerModel, 65 | ReformerModelWithLMHead, 66 | ) 67 | 68 | else: 69 | import importlib 70 | import os 71 | import sys 72 | 73 | class _LazyModule(_BaseLazyModule): 74 | """ 75 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 76 | """ 77 | 78 | __file__ = globals()["__file__"] 79 | __path__ = [os.path.dirname(__file__)] 80 | 81 | def _get_module(self, module_name: str): 82 | return importlib.import_module("." + module_name, self.__name__) 83 | 84 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 85 | -------------------------------------------------------------------------------- /transformers/models/electra/tokenization_electra.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | from ..bert.tokenization_bert import BertTokenizer 17 | 18 | 19 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 20 | 21 | PRETRAINED_VOCAB_FILES_MAP = { 22 | "vocab_file": { 23 | "google/electra-small-generator": "https://huggingface.co/google/electra-small-generator/resolve/main/vocab.txt", 24 | "google/electra-base-generator": "https://huggingface.co/google/electra-base-generator/resolve/main/vocab.txt", 25 | "google/electra-large-generator": "https://huggingface.co/google/electra-large-generator/resolve/main/vocab.txt", 26 | "google/electra-small-discriminator": "https://huggingface.co/google/electra-small-discriminator/resolve/main/vocab.txt", 27 | "google/electra-base-discriminator": "https://huggingface.co/google/electra-base-discriminator/resolve/main/vocab.txt", 28 | "google/electra-large-discriminator": "https://huggingface.co/google/electra-large-discriminator/resolve/main/vocab.txt", 29 | } 30 | } 31 | 32 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 33 | "google/electra-small-generator": 512, 34 | "google/electra-base-generator": 512, 35 | "google/electra-large-generator": 512, 36 | "google/electra-small-discriminator": 512, 37 | "google/electra-base-discriminator": 512, 38 | "google/electra-large-discriminator": 512, 39 | } 40 | 41 | 42 | PRETRAINED_INIT_CONFIGURATION = { 43 | "google/electra-small-generator": {"do_lower_case": True}, 44 | "google/electra-base-generator": {"do_lower_case": True}, 45 | "google/electra-large-generator": {"do_lower_case": True}, 46 | "google/electra-small-discriminator": {"do_lower_case": True}, 47 | "google/electra-base-discriminator": {"do_lower_case": True}, 48 | "google/electra-large-discriminator": {"do_lower_case": True}, 49 | } 50 | 51 | 52 | class ElectraTokenizer(BertTokenizer): 53 | r""" 54 | Construct an ELECTRA tokenizer. 55 | 56 | :class:`~transformers.ElectraTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 57 | tokenization: punctuation splitting and wordpiece. 58 | 59 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 60 | parameters. 61 | """ 62 | 63 | vocab_files_names = VOCAB_FILES_NAMES 64 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 65 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 66 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 67 | -------------------------------------------------------------------------------- /transformers/models/pegasus/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | from typing import TYPE_CHECKING 19 | 20 | from ...file_utils import ( 21 | _BaseLazyModule, 22 | is_sentencepiece_available, 23 | is_tf_available, 24 | is_tokenizers_available, 25 | is_torch_available, 26 | ) 27 | 28 | 29 | _import_structure = { 30 | "configuration_pegasus": ["PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP", "PegasusConfig"], 31 | } 32 | 33 | if is_sentencepiece_available(): 34 | _import_structure["tokenization_pegasus"] = ["PegasusTokenizer"] 35 | 36 | if is_tokenizers_available(): 37 | _import_structure["tokenization_pegasus_fast"] = ["PegasusTokenizerFast"] 38 | 39 | if is_torch_available(): 40 | _import_structure["modeling_pegasus"] = [ 41 | "PEGASUS_PRETRAINED_MODEL_ARCHIVE_LIST", 42 | "PegasusForConditionalGeneration", 43 | "PegasusModel", 44 | "PegasusPreTrainedModel", 45 | ] 46 | 47 | if is_tf_available(): 48 | _import_structure["modeling_tf_pegasus"] = ["TFPegasusForConditionalGeneration", "TFPegasusModel"] 49 | 50 | 51 | if TYPE_CHECKING: 52 | from .configuration_pegasus import PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, PegasusConfig 53 | 54 | if is_sentencepiece_available(): 55 | from .tokenization_pegasus import PegasusTokenizer 56 | 57 | if is_tokenizers_available(): 58 | from .tokenization_pegasus_fast import PegasusTokenizerFast 59 | 60 | if is_torch_available(): 61 | from .modeling_pegasus import ( 62 | PEGASUS_PRETRAINED_MODEL_ARCHIVE_LIST, 63 | PegasusForConditionalGeneration, 64 | PegasusModel, 65 | PegasusPreTrainedModel, 66 | ) 67 | 68 | if is_tf_available(): 69 | from .modeling_tf_pegasus import TFPegasusForConditionalGeneration, TFPegasusModel 70 | 71 | else: 72 | import importlib 73 | import os 74 | import sys 75 | 76 | class _LazyModule(_BaseLazyModule): 77 | """ 78 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 79 | """ 80 | 81 | __file__ = globals()["__file__"] 82 | __path__ = [os.path.dirname(__file__)] 83 | 84 | def _get_module(self, module_name: str): 85 | return importlib.import_module("." + module_name, self.__name__) 86 | 87 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 88 | -------------------------------------------------------------------------------- /transformers/models/bart/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | from typing import TYPE_CHECKING 19 | 20 | from ...file_utils import _BaseLazyModule, is_tf_available, is_tokenizers_available, is_torch_available 21 | 22 | 23 | _import_structure = { 24 | "configuration_bart": ["BART_PRETRAINED_CONFIG_ARCHIVE_MAP", "BartConfig"], 25 | "tokenization_bart": ["BartTokenizer"], 26 | } 27 | 28 | if is_tokenizers_available(): 29 | _import_structure["tokenization_bart_fast"] = ["BartTokenizerFast"] 30 | 31 | if is_torch_available(): 32 | _import_structure["modeling_bart"] = [ 33 | "BART_PRETRAINED_MODEL_ARCHIVE_LIST", 34 | "BartForConditionalGeneration", 35 | "BartForQuestionAnswering", 36 | "BartForSequenceClassification", 37 | "BartModel", 38 | "BartPretrainedModel", 39 | "PretrainedBartModel", 40 | ] 41 | 42 | if is_tf_available(): 43 | _import_structure["modeling_tf_bart"] = ["TFBartForConditionalGeneration", "TFBartModel", "TFBartPretrainedModel"] 44 | 45 | 46 | if TYPE_CHECKING: 47 | from .configuration_bart import BART_PRETRAINED_CONFIG_ARCHIVE_MAP, BartConfig 48 | from .tokenization_bart import BartTokenizer 49 | 50 | if is_tokenizers_available(): 51 | from .tokenization_bart_fast import BartTokenizerFast 52 | 53 | if is_torch_available(): 54 | from .modeling_bart import ( 55 | BART_PRETRAINED_MODEL_ARCHIVE_LIST, 56 | BartForConditionalGeneration, 57 | BartForQuestionAnswering, 58 | BartForSequenceClassification, 59 | BartModel, 60 | BartPretrainedModel, 61 | PretrainedBartModel, 62 | ) 63 | 64 | if is_tf_available(): 65 | from .modeling_tf_bart import TFBartForConditionalGeneration, TFBartModel, TFBartPretrainedModel 66 | 67 | else: 68 | import importlib 69 | import os 70 | import sys 71 | 72 | class _LazyModule(_BaseLazyModule): 73 | """ 74 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 75 | """ 76 | 77 | __file__ = globals()["__file__"] 78 | __path__ = [os.path.dirname(__file__)] 79 | 80 | def _get_module(self, module_name: str): 81 | return importlib.import_module("." + module_name, self.__name__) 82 | 83 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 84 | -------------------------------------------------------------------------------- /transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert ELECTRA checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import torch 21 | 22 | from transformers import ElectraConfig, ElectraForMaskedLM, ElectraForPreTraining, load_tf_weights_in_electra 23 | from transformers.utils import logging 24 | 25 | 26 | logging.set_verbosity_info() 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator): 30 | # Initialise PyTorch model 31 | config = ElectraConfig.from_json_file(config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | 34 | if discriminator_or_generator == "discriminator": 35 | model = ElectraForPreTraining(config) 36 | elif discriminator_or_generator == "generator": 37 | model = ElectraForMaskedLM(config) 38 | else: 39 | raise ValueError("The discriminator_or_generator argument should be either 'discriminator' or 'generator'") 40 | 41 | # Load weights from tf checkpoint 42 | load_tf_weights_in_electra( 43 | model, config, tf_checkpoint_path, discriminator_or_generator=discriminator_or_generator 44 | ) 45 | 46 | # Save pytorch-model 47 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 48 | torch.save(model.state_dict(), pytorch_dump_path) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | # Required parameters 54 | parser.add_argument( 55 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 56 | ) 57 | parser.add_argument( 58 | "--config_file", 59 | default=None, 60 | type=str, 61 | required=True, 62 | help="The config json file corresponding to the pre-trained model. \n" 63 | "This specifies the model architecture.", 64 | ) 65 | parser.add_argument( 66 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 67 | ) 68 | parser.add_argument( 69 | "--discriminator_or_generator", 70 | default=None, 71 | type=str, 72 | required=True, 73 | help="Whether to export the generator or the discriminator. Should be a string, either 'discriminator' or " 74 | "'generator'.", 75 | ) 76 | args = parser.parse_args() 77 | convert_tf_checkpoint_to_pytorch( 78 | args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.discriminator_or_generator 79 | ) 80 | -------------------------------------------------------------------------------- /transformers/models/lxmert/tokenization_lxmert_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | from ..bert.tokenization_bert_fast import BertTokenizerFast 17 | from .tokenization_lxmert import LxmertTokenizer 18 | 19 | 20 | #################################################### 21 | # Mapping from the keyword arguments names of Tokenizer `__init__` 22 | # to file names for serializing Tokenizer instances 23 | #################################################### 24 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} 25 | 26 | #################################################### 27 | # Mapping from the keyword arguments names of Tokenizer `__init__` 28 | # to pretrained vocabulary URL for all the model ids. 29 | #################################################### 30 | PRETRAINED_VOCAB_FILES_MAP = { 31 | "vocab_file": { 32 | "unc-nlp/lxmert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", 33 | }, 34 | "tokenizer_file": { 35 | "unc-nlp/lxmert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", 36 | }, 37 | } 38 | 39 | #################################################### 40 | # Mapping from model ids to max length of inputs 41 | #################################################### 42 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 43 | "unc-nlp/lxmert-base-uncased": 512, 44 | } 45 | #################################################### 46 | # Mapping from model ids to a dictionary of additional 47 | # keyword arguments for Tokenizer `__init__`. 48 | # To be used for checkpoint specific configurations. 49 | #################################################### 50 | PRETRAINED_INIT_CONFIGURATION = { 51 | "unc-nlp/lxmert-base-uncased": {"do_lower_case": True}, 52 | } 53 | 54 | 55 | class LxmertTokenizerFast(BertTokenizerFast): 56 | r""" 57 | Construct a "fast" LXMERT tokenizer (backed by HuggingFace's `tokenizers` library). 58 | 59 | :class:`~transformers.LxmertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs 60 | end-to-end tokenization: punctuation splitting and wordpiece. 61 | 62 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning 63 | parameters. 64 | """ 65 | vocab_files_names = VOCAB_FILES_NAMES 66 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 67 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 68 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 69 | slow_tokenizer_class = LxmertTokenizer 70 | -------------------------------------------------------------------------------- /transformers/models/herbert/tokenization_herbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google AI Language Team Authors, Allegro.pl, Facebook Inc. and the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 
16 | from ...utils import logging
17 | from ..bert.tokenization_bert import BasicTokenizer
18 | from ..xlm.tokenization_xlm import XLMTokenizer
19 | 
20 | 
21 | logger = logging.get_logger(__name__)
22 | 
23 | VOCAB_FILES_NAMES = {
24 |     "vocab_file": "vocab.json",
25 |     "merges_file": "merges.txt",
26 | }
27 | 
28 | PRETRAINED_VOCAB_FILES_MAP = {
29 |     "vocab_file": {"allegro/herbert-base-cased": "https://cdn.huggingface.co/allegro/herbert-base-cased/vocab.json"},
30 |     "merges_file": {"allegro/herbert-base-cased": "https://cdn.huggingface.co/allegro/herbert-base-cased/merges.txt"},
31 | }
32 | 
33 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"allegro/herbert-base-cased": 514}
34 | PRETRAINED_INIT_CONFIGURATION = {}
35 | 
36 | 
37 | class HerbertTokenizer(XLMTokenizer):
38 |     """
39 |     Construct a BPE tokenizer for HerBERT.
40 | 
41 |     Peculiarities:
42 | 
43 |     - uses BERT's pre-tokenizer: BasicTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of a
44 |       punctuation character will be treated separately.
45 | 
46 |     - Such pretokenized input is BPE subtokenized
47 | 
48 |     This tokenizer inherits from :class:`~transformers.XLMTokenizer` which contains most of the methods. Users should
49 |     refer to the superclass for more information regarding methods.
50 |     """
51 | 
52 |     vocab_files_names = VOCAB_FILES_NAMES
53 |     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
54 |     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
55 |     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
56 | 
57 |     def __init__(self, **kwargs):
58 | 
59 |         kwargs["cls_token"] = "<s>"
60 |         kwargs["unk_token"] = "<unk>"
61 |         kwargs["pad_token"] = "<pad>"
62 |         kwargs["mask_token"] = "<mask>"
63 |         kwargs["sep_token"] = "</s>"
64 |         kwargs["do_lowercase_and_remove_accent"] = False
65 |         kwargs["additional_special_tokens"] = []
66 | 
67 |         super().__init__(**kwargs)
68 |         self.bert_pre_tokenizer = BasicTokenizer(
69 |             do_lower_case=False, never_split=self.all_special_tokens, tokenize_chinese_chars=False, strip_accents=False
70 |         )
71 | 
72 |     def _tokenize(self, text):
73 | 
74 |         pre_tokens = self.bert_pre_tokenizer.tokenize(text)
75 | 
76 |         split_tokens = []
77 |         for token in pre_tokens:
78 |             if token:
79 |                 split_tokens.extend([t for t in self.bpe(token).split(" ")])
80 | 
81 |         return split_tokens
82 | 
--------------------------------------------------------------------------------
/transformers/models/distilbert/tokenization_distilbert.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes for DistilBERT.""" 16 | 17 | from ...utils import logging 18 | from ..bert.tokenization_bert import BertTokenizer 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 24 | 25 | PRETRAINED_VOCAB_FILES_MAP = { 26 | "vocab_file": { 27 | "distilbert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", 28 | "distilbert-base-uncased-distilled-squad": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt", 29 | "distilbert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt", 30 | "distilbert-base-cased-distilled-squad": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt", 31 | "distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/vocab.txt", 32 | "distilbert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt", 33 | } 34 | } 35 | 36 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 37 | "distilbert-base-uncased": 512, 38 | "distilbert-base-uncased-distilled-squad": 512, 39 | "distilbert-base-cased": 512, 40 | "distilbert-base-cased-distilled-squad": 512, 41 | "distilbert-base-german-cased": 512, 42 | "distilbert-base-multilingual-cased": 512, 43 | } 44 | 45 | 46 | PRETRAINED_INIT_CONFIGURATION = { 47 | "distilbert-base-uncased": {"do_lower_case": True}, 48 | "distilbert-base-uncased-distilled-squad": {"do_lower_case": True}, 49 | "distilbert-base-cased": {"do_lower_case": False}, 50 | "distilbert-base-cased-distilled-squad": {"do_lower_case": False}, 51 | "distilbert-base-german-cased": {"do_lower_case": False}, 52 | "distilbert-base-multilingual-cased": {"do_lower_case": False}, 53 | } 54 | 55 | 56 | class DistilBertTokenizer(BertTokenizer): 57 | r""" 58 | Construct a DistilBERT tokenizer. 59 | 60 | :class:`~transformers.DistilBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 61 | tokenization: punctuation splitting and wordpiece. 62 | 63 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 64 | parameters. 65 | """ 66 | 67 | vocab_files_names = VOCAB_FILES_NAMES 68 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 69 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 70 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 71 | model_input_names = ["attention_mask"] 72 | -------------------------------------------------------------------------------- /transformers/models/squeezebert/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_tokenizers_available, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_squeezebert": ["SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "SqueezeBertConfig"], 26 | "tokenization_squeezebert": ["SqueezeBertTokenizer"], 27 | } 28 | 29 | if is_tokenizers_available(): 30 | _import_structure["tokenization_squeezebert_fast"] = ["SqueezeBertTokenizerFast"] 31 | 32 | if is_torch_available(): 33 | _import_structure["modeling_squeezebert"] = [ 34 | "SQUEEZEBERT_PRETRAINED_MODEL_ARCHIVE_LIST", 35 | "SqueezeBertForMaskedLM", 36 | "SqueezeBertForMultipleChoice", 37 | "SqueezeBertForQuestionAnswering", 38 | "SqueezeBertForSequenceClassification", 39 | "SqueezeBertForTokenClassification", 40 | "SqueezeBertModel", 41 | "SqueezeBertModule", 42 | "SqueezeBertPreTrainedModel", 43 | ] 44 | 45 | 46 | if TYPE_CHECKING: 47 | from .configuration_squeezebert import SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, SqueezeBertConfig 48 | from .tokenization_squeezebert import SqueezeBertTokenizer 49 | 50 | if is_tokenizers_available(): 51 | from .tokenization_squeezebert_fast import SqueezeBertTokenizerFast 52 | 53 | if is_torch_available(): 54 | from .modeling_squeezebert import ( 55 | SQUEEZEBERT_PRETRAINED_MODEL_ARCHIVE_LIST, 56 | SqueezeBertForMaskedLM, 57 | SqueezeBertForMultipleChoice, 58 | SqueezeBertForQuestionAnswering, 59 | SqueezeBertForSequenceClassification, 60 | SqueezeBertForTokenClassification, 61 | SqueezeBertModel, 62 | SqueezeBertModule, 63 | SqueezeBertPreTrainedModel, 64 | ) 65 | 66 | else: 67 | import importlib 68 | import os 69 | import sys 70 | 71 | class _LazyModule(_BaseLazyModule): 72 | """ 73 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 74 | """ 75 | 76 | __file__ = globals()["__file__"] 77 | __path__ = [os.path.dirname(__file__)] 78 | 79 | def _get_module(self, module_name: str): 80 | return importlib.import_module("." + module_name, self.__name__) 81 | 82 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 83 | -------------------------------------------------------------------------------- /transformers/models/squeezebert/tokenization_squeezebert_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The SqueezeBert authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for SqueezeBERT.""" 16 | 17 | from ...utils import logging 18 | from ..bert.tokenization_bert_fast import BertTokenizerFast 19 | from .tokenization_squeezebert import SqueezeBertTokenizer 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} 25 | 26 | PRETRAINED_VOCAB_FILES_MAP = { 27 | "vocab_file": { 28 | "squeezebert/squeezebert-uncased": "https://huggingface.co/squeezebert/squeezebert-uncased/resolve/main/vocab.txt", 29 | "squeezebert/squeezebert-mnli": "https://huggingface.co/squeezebert/squeezebert-mnli/resolve/main/vocab.txt", 30 | "squeezebert/squeezebert-mnli-headless": "https://huggingface.co/squeezebert/squeezebert-mnli-headless/resolve/main/vocab.txt", 31 | }, 32 | "tokenizer_file": { 33 | "squeezebert/squeezebert-uncased": "https://huggingface.co/squeezebert/squeezebert-uncased/resolve/main/tokenizer.json", 34 | "squeezebert/squeezebert-mnli": "https://huggingface.co/squeezebert/squeezebert-mnli/resolve/main/tokenizer.json", 35 | "squeezebert/squeezebert-mnli-headless": "https://huggingface.co/squeezebert/squeezebert-mnli-headless/resolve/main/tokenizer.json", 36 | }, 37 | } 38 | 39 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 40 | "squeezebert/squeezebert-uncased": 512, 41 | "squeezebert/squeezebert-mnli": 512, 42 | "squeezebert/squeezebert-mnli-headless": 512, 43 | } 44 | 45 | 46 | PRETRAINED_INIT_CONFIGURATION = { 47 | "squeezebert/squeezebert-uncased": {"do_lower_case": True}, 48 | "squeezebert/squeezebert-mnli": {"do_lower_case": True}, 49 | "squeezebert/squeezebert-mnli-headless": {"do_lower_case": True}, 50 | } 51 | 52 | 53 | class SqueezeBertTokenizerFast(BertTokenizerFast): 54 | r""" 55 | Constructs a "Fast" SqueezeBert tokenizer (backed by HuggingFace's `tokenizers` library). 56 | 57 | :class:`~transformers.SqueezeBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs 58 | end-to-end tokenization: punctuation splitting + wordpiece. 59 | 60 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning 61 | parameters. 62 | """ 63 | 64 | vocab_files_names = VOCAB_FILES_NAMES 65 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 66 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 67 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 68 | slow_tokenizer_class = SqueezeBertTokenizer 69 | -------------------------------------------------------------------------------- /transformers/convert_tf_hub_seq_to_seq_bert_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert Seq2Seq TF Hub checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | from . 
import ( 21 | BertConfig, 22 | BertGenerationConfig, 23 | BertGenerationDecoder, 24 | BertGenerationEncoder, 25 | load_tf_weights_in_bert_generation, 26 | logging, 27 | ) 28 | 29 | 30 | logging.set_verbosity_info() 31 | 32 | 33 | def convert_tf_checkpoint_to_pytorch(tf_hub_path, pytorch_dump_path, is_encoder_named_decoder, vocab_size, is_encoder): 34 | # Initialise PyTorch model 35 | bert_config = BertConfig.from_pretrained( 36 | "bert-large-cased", 37 | vocab_size=vocab_size, 38 | max_position_embeddings=512, 39 | is_decoder=True, 40 | add_cross_attention=True, 41 | ) 42 | bert_config_dict = bert_config.to_dict() 43 | del bert_config_dict["type_vocab_size"] 44 | config = BertGenerationConfig(**bert_config_dict) 45 | if is_encoder: 46 | model = BertGenerationEncoder(config) 47 | else: 48 | model = BertGenerationDecoder(config) 49 | print("Building PyTorch model from configuration: {}".format(str(config))) 50 | 51 | # Load weights from tf checkpoint 52 | load_tf_weights_in_bert_generation( 53 | model, 54 | tf_hub_path, 55 | model_class="bert", 56 | is_encoder_named_decoder=is_encoder_named_decoder, 57 | is_encoder=is_encoder, 58 | ) 59 | 60 | # Save pytorch-model 61 | print("Save PyTorch model and config to {}".format(pytorch_dump_path)) 62 | model.save_pretrained(pytorch_dump_path) 63 | 64 | 65 | if __name__ == "__main__": 66 | parser = argparse.ArgumentParser() 67 | # Required parameters 68 | parser.add_argument( 69 | "--tf_hub_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 70 | ) 71 | parser.add_argument( 72 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 73 | ) 74 | parser.add_argument( 75 | "--is_encoder_named_decoder", 76 | action="store_true", 77 | help="If decoder has to be renamed to encoder in PyTorch model.", 78 | ) 79 | parser.add_argument("--is_encoder", action="store_true", help="If model is an encoder.") 80 | parser.add_argument("--vocab_size", default=50358, type=int, help="Vocab size of model") 81 | args = parser.parse_args() 82 | convert_tf_checkpoint_to_pytorch( 83 | args.tf_hub_path, 84 | args.pytorch_dump_path, 85 | args.is_encoder_named_decoder, 86 | args.vocab_size, 87 | is_encoder=args.is_encoder, 88 | ) 89 | -------------------------------------------------------------------------------- /transformers/models/mt5/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import ( 22 | _BaseLazyModule, 23 | is_sentencepiece_available, 24 | is_tf_available, 25 | is_tokenizers_available, 26 | is_torch_available, 27 | ) 28 | 29 | 30 | if is_sentencepiece_available(): 31 | from ..t5.tokenization_t5 import T5Tokenizer 32 | 33 | MT5Tokenizer = T5Tokenizer 34 | 35 | if is_tokenizers_available(): 36 | from ..t5.tokenization_t5_fast import T5TokenizerFast 37 | 38 | MT5TokenizerFast = T5TokenizerFast 39 | 40 | _import_structure = { 41 | "configuration_mt5": ["MT5Config"], 42 | } 43 | 44 | if is_torch_available(): 45 | _import_structure["modeling_mt5"] = ["MT5EncoderModel", "MT5ForConditionalGeneration", "MT5Model"] 46 | 47 | if is_tf_available(): 48 | _import_structure["modeling_tf_mt5"] = ["TFMT5EncoderModel", "TFMT5ForConditionalGeneration", "TFMT5Model"] 49 | 50 | 51 | if TYPE_CHECKING: 52 | from .configuration_mt5 import MT5Config 53 | 54 | if is_sentencepiece_available(): 55 | from ..t5.tokenization_t5 import T5Tokenizer 56 | 57 | MT5Tokenizer = T5Tokenizer 58 | 59 | if is_tokenizers_available(): 60 | from ..t5.tokenization_t5_fast import T5TokenizerFast 61 | 62 | MT5TokenizerFast = T5TokenizerFast 63 | 64 | if is_torch_available(): 65 | from .modeling_mt5 import MT5EncoderModel, MT5ForConditionalGeneration, MT5Model 66 | 67 | if is_tf_available(): 68 | from .modeling_tf_mt5 import TFMT5EncoderModel, TFMT5ForConditionalGeneration, TFMT5Model 69 | 70 | else: 71 | import importlib 72 | import os 73 | import sys 74 | 75 | class _LazyModule(_BaseLazyModule): 76 | """ 77 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 78 | """ 79 | 80 | __file__ = globals()["__file__"] 81 | __path__ = [os.path.dirname(__file__)] 82 | 83 | def _get_module(self, module_name: str): 84 | return importlib.import_module("." + module_name, self.__name__) 85 | 86 | def __getattr__(self, name): 87 | if name == "MT5Tokenizer": 88 | return MT5Tokenizer 89 | elif name == name == "MT5TokenizerFast": 90 | return MT5TokenizerFast 91 | else: 92 | return super().__getattr__(name) 93 | 94 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 95 | -------------------------------------------------------------------------------- /transformers/models/mbart/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | from typing import TYPE_CHECKING 19 | 20 | from ...file_utils import ( 21 | _BaseLazyModule, 22 | is_sentencepiece_available, 23 | is_tf_available, 24 | is_tokenizers_available, 25 | is_torch_available, 26 | ) 27 | 28 | 29 | _import_structure = { 30 | "configuration_mbart": ["MBART_PRETRAINED_CONFIG_ARCHIVE_MAP", "MBartConfig"], 31 | } 32 | 33 | if is_sentencepiece_available(): 34 | _import_structure["tokenization_mbart"] = ["MBartTokenizer"] 35 | 36 | if is_tokenizers_available(): 37 | _import_structure["tokenization_mbart_fast"] = ["MBartTokenizerFast"] 38 | 39 | if is_torch_available(): 40 | _import_structure["modeling_mbart"] = [ 41 | "MBART_PRETRAINED_MODEL_ARCHIVE_LIST", 42 | "MBartForConditionalGeneration", 43 | "MBartForQuestionAnswering", 44 | "MBartForSequenceClassification", 45 | "MBartModel", 46 | "MBartPreTrainedModel", 47 | ] 48 | 49 | if is_tf_available(): 50 | _import_structure["modeling_tf_mbart"] = ["TFMBartForConditionalGeneration", "TFMBartModel"] 51 | 52 | 53 | if TYPE_CHECKING: 54 | from .configuration_mbart import MBART_PRETRAINED_CONFIG_ARCHIVE_MAP, MBartConfig 55 | 56 | if is_sentencepiece_available(): 57 | from .tokenization_mbart import MBartTokenizer 58 | 59 | if is_tokenizers_available(): 60 | from .tokenization_mbart_fast import MBartTokenizerFast 61 | 62 | if is_torch_available(): 63 | from .modeling_mbart import ( 64 | MBART_PRETRAINED_MODEL_ARCHIVE_LIST, 65 | MBartForConditionalGeneration, 66 | MBartForQuestionAnswering, 67 | MBartForSequenceClassification, 68 | MBartModel, 69 | MBartPreTrainedModel, 70 | ) 71 | 72 | if is_tf_available(): 73 | from .modeling_tf_mbart import TFMBartForConditionalGeneration, TFMBartModel 74 | 75 | else: 76 | import importlib 77 | import os 78 | import sys 79 | 80 | class _LazyModule(_BaseLazyModule): 81 | """ 82 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 83 | """ 84 | 85 | __file__ = globals()["__file__"] 86 | __path__ = [os.path.dirname(__file__)] 87 | 88 | def _get_module(self, module_name: str): 89 | return importlib.import_module("." + module_name, self.__name__) 90 | 91 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 92 | -------------------------------------------------------------------------------- /transformers/models/xlm/convert_xlm_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import json 20 | 21 | import numpy 22 | import torch 23 | 24 | from transformers.file_utils import CONFIG_NAME, WEIGHTS_NAME 25 | from transformers.models.xlm.tokenization_xlm import VOCAB_FILES_NAMES 26 | from transformers.utils import logging 27 | 28 | 29 | logging.set_verbosity_info() 30 | 31 | 32 | def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path): 33 | # Load checkpoint 34 | chkpt = torch.load(xlm_checkpoint_path, map_location="cpu") 35 | 36 | state_dict = chkpt["model"] 37 | 38 | # We have the base model one level deeper than the original XLM repository 39 | two_levels_state_dict = {} 40 | for k, v in state_dict.items(): 41 | if "pred_layer" in k: 42 | two_levels_state_dict[k] = v 43 | else: 44 | two_levels_state_dict["transformer." + k] = v 45 | 46 | config = chkpt["params"] 47 | config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))) 48 | 49 | vocab = chkpt["dico_word2id"] 50 | vocab = dict((s + "" if s.find("@@") == -1 and i > 13 else s.replace("@@", ""), i) for s, i in vocab.items()) 51 | 52 | # Save pytorch-model 53 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 54 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 55 | pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["vocab_file"] 56 | 57 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 58 | torch.save(two_levels_state_dict, pytorch_weights_dump_path) 59 | 60 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 61 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 62 | f.write(json.dumps(config, indent=2) + "\n") 63 | 64 | print("Save vocab file to {}".format(pytorch_config_dump_path)) 65 | with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f: 66 | f.write(json.dumps(vocab, indent=2) + "\n") 67 | 68 | 69 | if __name__ == "__main__": 70 | parser = argparse.ArgumentParser() 71 | # Required parameters 72 | parser.add_argument( 73 | "--xlm_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." 74 | ) 75 | parser.add_argument( 76 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 77 | ) 78 | args = parser.parse_args() 79 | convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path) 80 | -------------------------------------------------------------------------------- /transformers/models/openai/tokenization_openai_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Fast Tokenization classes for OpenAI GPT.""" 16 | 17 | 18 | from typing import Optional, Tuple 19 | 20 | from ...tokenization_utils_fast import PreTrainedTokenizerFast 21 | from ...utils import logging 22 | from .tokenization_openai import OpenAIGPTTokenizer 23 | 24 | 25 | logger = logging.get_logger(__name__) 26 | 27 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} 28 | 29 | PRETRAINED_VOCAB_FILES_MAP = { 30 | "vocab_file": {"openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/vocab.json"}, 31 | "merges_file": {"openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/merges.txt"}, 32 | "tokenizer_file": {"openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/tokenizer.json"}, 33 | } 34 | 35 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 36 | "openai-gpt": 512, 37 | } 38 | 39 | 40 | class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast): 41 | """ 42 | Construct a "fast" GPT Tokenizer (backed by HuggingFace's `tokenizers` library). Based on Byte-Pair-Encoding with 43 | the following peculiarities: 44 | 45 | - lower case all inputs 46 | - uses BERT's BasicTokenizer for pre-BPE tokenization 47 | 48 | This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main 49 | methods. Users should refer to this superclass for more information regarding those methods. 50 | 51 | Args: 52 | vocab_file (:obj:`str`): 53 | Path to the vocabulary file. 54 | merges_file (:obj:`str`): 55 | Path to the merges file. 56 | unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): 57 | The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this 58 | token instead. 59 | """ 60 | 61 | vocab_files_names = VOCAB_FILES_NAMES 62 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 63 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 64 | model_input_names = ["attention_mask"] 65 | slow_tokenizer_class = OpenAIGPTTokenizer 66 | 67 | def __init__(self, vocab_file, merges_file, tokenizer_file=None, unk_token="", **kwargs): 68 | super().__init__(vocab_file, merges_file, tokenizer_file=tokenizer_file, unk_token=unk_token, **kwargs) 69 | 70 | @property 71 | def do_lower_case(self): 72 | return True 73 | 74 | def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: 75 | files = self._tokenizer.model.save(save_directory, name=filename_prefix) 76 | return tuple(files) 77 | -------------------------------------------------------------------------------- /transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert RoBERTa checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import pytorch_lightning as pl 21 | import torch 22 | 23 | from transformers import LongformerForQuestionAnswering, LongformerModel 24 | 25 | 26 | class LightningModel(pl.LightningModule): 27 | def __init__(self, model): 28 | super().__init__() 29 | self.model = model 30 | self.num_labels = 2 31 | self.qa_outputs = torch.nn.Linear(self.model.config.hidden_size, self.num_labels) 32 | 33 | # implement only because lightning requires to do so 34 | def forward(self): 35 | pass 36 | 37 | 38 | def convert_longformer_qa_checkpoint_to_pytorch( 39 | longformer_model: str, longformer_question_answering_ckpt_path: str, pytorch_dump_folder_path: str 40 | ): 41 | 42 | # load longformer model from model identifier 43 | longformer = LongformerModel.from_pretrained(longformer_model) 44 | lightning_model = LightningModel(longformer) 45 | 46 | ckpt = torch.load(longformer_question_answering_ckpt_path, map_location=torch.device("cpu")) 47 | lightning_model.load_state_dict(ckpt["state_dict"]) 48 | 49 | # init longformer question answering model 50 | longformer_for_qa = LongformerForQuestionAnswering.from_pretrained(longformer_model) 51 | 52 | # transfer weights 53 | longformer_for_qa.longformer.load_state_dict(lightning_model.model.state_dict()) 54 | longformer_for_qa.qa_outputs.load_state_dict(lightning_model.qa_outputs.state_dict()) 55 | longformer_for_qa.eval() 56 | 57 | # save model 58 | longformer_for_qa.save_pretrained(pytorch_dump_folder_path) 59 | 60 | print("Conversion successful. Model saved under {}".format(pytorch_dump_folder_path)) 61 | 62 | 63 | if __name__ == "__main__": 64 | parser = argparse.ArgumentParser() 65 | # Required parameters 66 | parser.add_argument( 67 | "--longformer_model", 68 | default=None, 69 | type=str, 70 | required=True, 71 | help="model identifier of longformer. Should be either `longformer-base-4096` or `longformer-large-4096`.", 72 | ) 73 | parser.add_argument( 74 | "--longformer_question_answering_ckpt_path", 75 | default=None, 76 | type=str, 77 | required=True, 78 | help="Path the official PyTorch Lightning Checkpoint.", 79 | ) 80 | parser.add_argument( 81 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 82 | ) 83 | args = parser.parse_args() 84 | convert_longformer_qa_checkpoint_to_pytorch( 85 | args.longformer_model, args.longformer_question_answering_ckpt_path, args.pytorch_dump_folder_path 86 | ) 87 | -------------------------------------------------------------------------------- /transformers/models/rag/tokenization_rag.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020, The RAG Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for RAG.""" 16 | import os 17 | from typing import List, Optional 18 | 19 | from ...tokenization_utils_base import BatchEncoding 20 | from ...utils import logging 21 | from .configuration_rag import RagConfig 22 | 23 | 24 | logger = logging.get_logger(__name__) 25 | 26 | 27 | class RagTokenizer: 28 | def __init__(self, question_encoder, generator): 29 | self.question_encoder = question_encoder 30 | self.generator = generator 31 | 32 | def save_pretrained(self, save_directory): 33 | if os.path.isfile(save_directory): 34 | raise ValueError("Provided path ({}) should be a directory, not a file".format(save_directory)) 35 | os.makedirs(save_directory, exist_ok=True) 36 | question_encoder_path = os.path.join(save_directory, "question_encoder_tokenizer") 37 | generator_path = os.path.join(save_directory, "generator_tokenizer") 38 | self.question_encoder.save_pretrained(question_encoder_path) 39 | self.generator.save_pretrained(generator_path) 40 | 41 | @classmethod 42 | def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): 43 | # dynamically import AutoTokenizer 44 | from ..auto.tokenization_auto import AutoTokenizer 45 | 46 | config = kwargs.pop("config", None) 47 | 48 | if config is None: 49 | config = RagConfig.from_pretrained(pretrained_model_name_or_path) 50 | 51 | question_encoder = AutoTokenizer.from_pretrained( 52 | pretrained_model_name_or_path, config=config.question_encoder, subfolder="question_encoder_tokenizer" 53 | ) 54 | generator = AutoTokenizer.from_pretrained( 55 | pretrained_model_name_or_path, config=config.generator, subfolder="generator_tokenizer" 56 | ) 57 | return cls(question_encoder=question_encoder, generator=generator) 58 | 59 | def __call__(self, *args, **kwargs): 60 | return self.question_encoder(*args, **kwargs) 61 | 62 | def batch_decode(self, *args, **kwargs): 63 | return self.generator.batch_decode(*args, **kwargs) 64 | 65 | def prepare_seq2seq_batch( 66 | self, 67 | src_texts: List[str], 68 | tgt_texts: Optional[List[str]] = None, 69 | max_length: Optional[int] = None, 70 | max_target_length: Optional[int] = None, 71 | **kwargs, 72 | ) -> BatchEncoding: 73 | if max_length is None: 74 | max_length = self.question_encoder.model_max_length 75 | if max_target_length is None: 76 | max_target_length = self.generator.model_max_length 77 | return super().prepare_seq2seq_batch( 78 | src_texts, tgt_texts, max_length=max_length, max_target_length=max_target_length, **kwargs 79 | ) 80 | -------------------------------------------------------------------------------- /transformers/utils/dummy_sentencepiece_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 
2 | from ..file_utils import requires_sentencepiece 3 | 4 | 5 | class AlbertTokenizer: 6 | def __init__(self, *args, **kwargs): 7 | requires_sentencepiece(self) 8 | 9 | @classmethod 10 | def from_pretrained(self, *args, **kwargs): 11 | requires_sentencepiece(self) 12 | 13 | 14 | class BarthezTokenizer: 15 | def __init__(self, *args, **kwargs): 16 | requires_sentencepiece(self) 17 | 18 | @classmethod 19 | def from_pretrained(self, *args, **kwargs): 20 | requires_sentencepiece(self) 21 | 22 | 23 | class BertGenerationTokenizer: 24 | def __init__(self, *args, **kwargs): 25 | requires_sentencepiece(self) 26 | 27 | @classmethod 28 | def from_pretrained(self, *args, **kwargs): 29 | requires_sentencepiece(self) 30 | 31 | 32 | class CamembertTokenizer: 33 | def __init__(self, *args, **kwargs): 34 | requires_sentencepiece(self) 35 | 36 | @classmethod 37 | def from_pretrained(self, *args, **kwargs): 38 | requires_sentencepiece(self) 39 | 40 | 41 | class MarianTokenizer: 42 | def __init__(self, *args, **kwargs): 43 | requires_sentencepiece(self) 44 | 45 | @classmethod 46 | def from_pretrained(self, *args, **kwargs): 47 | requires_sentencepiece(self) 48 | 49 | 50 | class MBartTokenizer: 51 | def __init__(self, *args, **kwargs): 52 | requires_sentencepiece(self) 53 | 54 | @classmethod 55 | def from_pretrained(self, *args, **kwargs): 56 | requires_sentencepiece(self) 57 | 58 | 59 | class MT5Tokenizer: 60 | def __init__(self, *args, **kwargs): 61 | requires_sentencepiece(self) 62 | 63 | @classmethod 64 | def from_pretrained(self, *args, **kwargs): 65 | requires_sentencepiece(self) 66 | 67 | 68 | class PegasusTokenizer: 69 | def __init__(self, *args, **kwargs): 70 | requires_sentencepiece(self) 71 | 72 | @classmethod 73 | def from_pretrained(self, *args, **kwargs): 74 | requires_sentencepiece(self) 75 | 76 | 77 | class ReformerTokenizer: 78 | def __init__(self, *args, **kwargs): 79 | requires_sentencepiece(self) 80 | 81 | @classmethod 82 | def from_pretrained(self, *args, **kwargs): 83 | requires_sentencepiece(self) 84 | 85 | 86 | class T5Tokenizer: 87 | def __init__(self, *args, **kwargs): 88 | requires_sentencepiece(self) 89 | 90 | @classmethod 91 | def from_pretrained(self, *args, **kwargs): 92 | requires_sentencepiece(self) 93 | 94 | 95 | class XLMProphetNetTokenizer: 96 | def __init__(self, *args, **kwargs): 97 | requires_sentencepiece(self) 98 | 99 | @classmethod 100 | def from_pretrained(self, *args, **kwargs): 101 | requires_sentencepiece(self) 102 | 103 | 104 | class XLMRobertaTokenizer: 105 | def __init__(self, *args, **kwargs): 106 | requires_sentencepiece(self) 107 | 108 | @classmethod 109 | def from_pretrained(self, *args, **kwargs): 110 | requires_sentencepiece(self) 111 | 112 | 113 | class XLNetTokenizer: 114 | def __init__(self, *args, **kwargs): 115 | requires_sentencepiece(self) 116 | 117 | @classmethod 118 | def from_pretrained(self, *args, **kwargs): 119 | requires_sentencepiece(self) 120 | -------------------------------------------------------------------------------- /transformers/activations.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import math 16 | 17 | import torch 18 | import torch.nn.functional as F 19 | from packaging import version 20 | 21 | from .utils import logging 22 | 23 | 24 | logger = logging.get_logger(__name__) 25 | 26 | 27 | def _gelu_python(x): 28 | """ 29 | Original Implementation of the GELU activation function in Google BERT repo when initially created. For 30 | information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + 31 | torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in 32 | torch.nn.functional Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 33 | """ 34 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 35 | 36 | 37 | def gelu_new(x): 38 | """ 39 | Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see 40 | the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 41 | """ 42 | return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0)))) 43 | 44 | 45 | if version.parse(torch.__version__) < version.parse("1.4"): 46 | gelu = _gelu_python 47 | else: 48 | gelu = F.gelu 49 | 50 | 51 | def gelu_fast(x): 52 | return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) 53 | 54 | 55 | def _silu_python(x): 56 | """ 57 | See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear 58 | Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function 59 | Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated 60 | Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with 61 | later. 
62 | """ 63 | return x * torch.sigmoid(x) 64 | 65 | 66 | if version.parse(torch.__version__) < version.parse("1.7"): 67 | silu = _silu_python 68 | else: 69 | silu = F.silu 70 | 71 | 72 | def mish(x): 73 | return x * torch.tanh(torch.nn.functional.softplus(x)) 74 | 75 | 76 | def linear_act(x): 77 | return x 78 | 79 | 80 | ACT2FN = { 81 | "relu": F.relu, 82 | "silu": silu, 83 | "swish": silu, 84 | "gelu": gelu, 85 | "tanh": torch.tanh, 86 | "gelu_new": gelu_new, 87 | "gelu_fast": gelu_fast, 88 | "mish": mish, 89 | "linear": linear_act, 90 | "sigmoid": torch.sigmoid, 91 | } 92 | 93 | 94 | def get_activation(activation_string): 95 | if activation_string in ACT2FN: 96 | return ACT2FN[activation_string] 97 | else: 98 | raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys()))) 99 | -------------------------------------------------------------------------------- /transformers/pipelines/text_classification.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from ..file_utils import add_end_docstrings, is_tf_available, is_torch_available 4 | from .base import PIPELINE_INIT_ARGS, Pipeline 5 | 6 | 7 | if is_tf_available(): 8 | from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING 9 | 10 | if is_torch_available(): 11 | from ..models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING 12 | 13 | 14 | @add_end_docstrings( 15 | PIPELINE_INIT_ARGS, 16 | r""" 17 | return_all_scores (:obj:`bool`, `optional`, defaults to :obj:`False`): 18 | Whether to return all prediction scores or just the one of the predicted class. 19 | """, 20 | ) 21 | class TextClassificationPipeline(Pipeline): 22 | """ 23 | Text classification pipeline using any :obj:`ModelForSequenceClassification`. See the `sequence classification 24 | examples <../task_summary.html#sequence-classification>`__ for more information. 25 | 26 | This text classification pipeline can currently be loaded from :func:`~transformers.pipeline` using the following 27 | task identifier: :obj:`"sentiment-analysis"` (for classifying sequences according to positive or negative 28 | sentiments). 29 | 30 | If multiple classification labels are available (:obj:`model.config.num_labels >= 2`), the pipeline will run a 31 | softmax over the results. If there is a single label, the pipeline will run a sigmoid over the result. 32 | 33 | The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. See 34 | the up-to-date list of available models on `huggingface.co/models 35 | `__. 36 | """ 37 | 38 | def __init__(self, return_all_scores: bool = False, **kwargs): 39 | super().__init__(**kwargs) 40 | 41 | self.check_model_type( 42 | TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING 43 | if self.framework == "tf" 44 | else MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING 45 | ) 46 | 47 | self.return_all_scores = return_all_scores 48 | 49 | def __call__(self, *args, **kwargs): 50 | """ 51 | Classify the text(s) given as inputs. 52 | 53 | Args: 54 | args (:obj:`str` or :obj:`List[str]`): 55 | One or several texts (or one list of prompts) to classify. 56 | 57 | Return: 58 | A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the following keys: 59 | 60 | - **label** (:obj:`str`) -- The label predicted. 61 | - **score** (:obj:`float`) -- The corresponding probability. 
62 | 63 | If ``self.return_all_scores=True``, one such dictionary is returned per label. 64 | """ 65 | outputs = super().__call__(*args, **kwargs) 66 | 67 | if self.model.config.num_labels == 1: 68 | scores = 1.0 / (1.0 + np.exp(-outputs)) 69 | else: 70 | scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True) 71 | if self.return_all_scores: 72 | return [ 73 | [{"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(item)] 74 | for item in scores 75 | ] 76 | else: 77 | return [ 78 | {"label": self.model.config.id2label[item.argmax()], "score": item.max().item()} for item in scores 79 | ] 80 | -------------------------------------------------------------------------------- /transformers/models/lxmert/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_tf_available, is_tokenizers_available, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_lxmert": ["LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "LxmertConfig"], 26 | "tokenization_lxmert": ["LxmertTokenizer"], 27 | } 28 | 29 | if is_tokenizers_available(): 30 | _import_structure["tokenization_lxmert_fast"] = ["LxmertTokenizerFast"] 31 | 32 | if is_torch_available(): 33 | _import_structure["modeling_lxmert"] = [ 34 | "LxmertEncoder", 35 | "LxmertForPreTraining", 36 | "LxmertForQuestionAnswering", 37 | "LxmertModel", 38 | "LxmertPreTrainedModel", 39 | "LxmertVisualFeatureEncoder", 40 | "LxmertXLayer", 41 | ] 42 | 43 | if is_tf_available(): 44 | _import_structure["modeling_tf_lxmert"] = [ 45 | "TF_LXMERT_PRETRAINED_MODEL_ARCHIVE_LIST", 46 | "TFLxmertForPreTraining", 47 | "TFLxmertMainLayer", 48 | "TFLxmertModel", 49 | "TFLxmertPreTrainedModel", 50 | "TFLxmertVisualFeatureEncoder", 51 | ] 52 | 53 | 54 | if TYPE_CHECKING: 55 | from .configuration_lxmert import LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP, LxmertConfig 56 | from .tokenization_lxmert import LxmertTokenizer 57 | 58 | if is_tokenizers_available(): 59 | from .tokenization_lxmert_fast import LxmertTokenizerFast 60 | 61 | if is_torch_available(): 62 | from .modeling_lxmert import ( 63 | LxmertEncoder, 64 | LxmertForPreTraining, 65 | LxmertForQuestionAnswering, 66 | LxmertModel, 67 | LxmertPreTrainedModel, 68 | LxmertVisualFeatureEncoder, 69 | LxmertXLayer, 70 | ) 71 | 72 | if is_tf_available(): 73 | from .modeling_tf_lxmert import ( 74 | TF_LXMERT_PRETRAINED_MODEL_ARCHIVE_LIST, 75 | TFLxmertForPreTraining, 76 | TFLxmertMainLayer, 77 | TFLxmertModel, 78 | TFLxmertPreTrainedModel, 79 | TFLxmertVisualFeatureEncoder, 80 | ) 81 | 82 | else: 83 | import importlib 84 | 
import os 85 | import sys 86 | 87 | class _LazyModule(_BaseLazyModule): 88 | """ 89 | Module class that surfaces all objects but only performs associated imports when the objects are requested. 90 | """ 91 | 92 | __file__ = globals()["__file__"] 93 | __path__ = [os.path.dirname(__file__)] 94 | 95 | def _get_module(self, module_name: str): 96 | return importlib.import_module("." + module_name, self.__name__) 97 | 98 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 99 | -------------------------------------------------------------------------------- /transformers/models/transfo_xl/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from typing import TYPE_CHECKING 20 | 21 | from ...file_utils import _BaseLazyModule, is_tf_available, is_torch_available 22 | 23 | 24 | _import_structure = { 25 | "configuration_transfo_xl": ["TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP", "TransfoXLConfig"], 26 | "tokenization_transfo_xl": ["TransfoXLCorpus", "TransfoXLTokenizer"], 27 | } 28 | 29 | if is_torch_available(): 30 | _import_structure["modeling_transfo_xl"] = [ 31 | "TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST", 32 | "AdaptiveEmbedding", 33 | "TransfoXLForSequenceClassification", 34 | "TransfoXLLMHeadModel", 35 | "TransfoXLModel", 36 | "TransfoXLPreTrainedModel", 37 | "load_tf_weights_in_transfo_xl", 38 | ] 39 | 40 | if is_tf_available(): 41 | _import_structure["modeling_tf_transfo_xl"] = [ 42 | "TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST", 43 | "TFAdaptiveEmbedding", 44 | "TFTransfoXLForSequenceClassification", 45 | "TFTransfoXLLMHeadModel", 46 | "TFTransfoXLMainLayer", 47 | "TFTransfoXLModel", 48 | "TFTransfoXLPreTrainedModel", 49 | ] 50 | 51 | 52 | if TYPE_CHECKING: 53 | from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig 54 | from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer 55 | 56 | if is_torch_available(): 57 | from .modeling_transfo_xl import ( 58 | TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST, 59 | AdaptiveEmbedding, 60 | TransfoXLForSequenceClassification, 61 | TransfoXLLMHeadModel, 62 | TransfoXLModel, 63 | TransfoXLPreTrainedModel, 64 | load_tf_weights_in_transfo_xl, 65 | ) 66 | 67 | if is_tf_available(): 68 | from .modeling_tf_transfo_xl import ( 69 | TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST, 70 | TFAdaptiveEmbedding, 71 | TFTransfoXLForSequenceClassification, 72 | TFTransfoXLLMHeadModel, 73 | TFTransfoXLMainLayer, 74 | TFTransfoXLModel, 75 | TFTransfoXLPreTrainedModel, 76 | ) 77 | 78 | else: 79 | import importlib 80 | import os 81 | import sys 82 | 83 | class _LazyModule(_BaseLazyModule): 84 | """ 85 | Module class that surfaces 
all objects but only performs associated imports when the objects are requested. 86 | """ 87 | 88 | __file__ = globals()["__file__"] 89 | __path__ = [os.path.dirname(__file__)] 90 | 91 | def _get_module(self, module_name: str): 92 | return importlib.import_module("." + module_name, self.__name__) 93 | 94 | sys.modules[__name__] = _LazyModule(__name__, _import_structure) 95 | --------------------------------------------------------------------------------
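The Transformer-XL __init__.py that closes the listing uses the same lazy-import boilerplate as the other model packages above: _import_structure maps submodule names to the public names they define, and the _LazyModule installed into sys.modules only imports a submodule when one of those names is requested. A toy analogue of that idea, simplified to standard-library modules so it runs on its own (it is not the library's _BaseLazyModule):

import importlib
import types


class LazyNamespace(types.SimpleNamespace):
    """Toy stand-in for the lazy-module pattern: resolve attributes on first access."""

    def __init__(self, attr_to_module):
        super().__init__()
        self._attr_to_module = attr_to_module

    def __getattr__(self, name):
        # Only called when normal attribute lookup fails, i.e. before the first access.
        try:
            module_name = self.__dict__["_attr_to_module"][name]
        except KeyError:
            raise AttributeError(name)
        value = getattr(importlib.import_module(module_name), name)
        setattr(self, name, value)  # cache so later accesses bypass __getattr__
        return value


ns = LazyNamespace({"sqrt": "math", "dumps": "json"})
print(ns.sqrt(2.0))        # the math module is imported only here
print(ns.dumps({"a": 1}))  # the json module is imported only here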
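Similarly, for the TextClassificationPipeline listed earlier (pipelines/text_classification.py), a hedged usage sketch via the pipeline() factory. The checkpoint actually used for the "sentiment-analysis" alias is whatever the installed transformers version resolves by default, and the printed labels and scores are illustrative only.

from transformers import pipeline

classifier = pipeline("sentiment-analysis")  # builds a TextClassificationPipeline
print(classifier("The generated summaries are surprisingly readable."))
# e.g. [{'label': 'POSITIVE', 'score': 0.99...}]

# return_all_scores=True is forwarded to TextClassificationPipeline.__init__ and yields
# one {label, score} dict per label instead of only the argmax label.
classifier_all = pipeline("sentiment-analysis", return_all_scores=True)
print(classifier_all("The generated summaries are surprisingly readable."))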