├── transformers
├── transformers
│ ├── tests
│ │ ├── __init__.py
│ │ ├── fixtures
│ │ │ ├── input.txt
│ │ │ ├── test_sentencepiece.model
│ │ │ └── sample_text.txt
│ │ ├── conftest.py
│ │ ├── tokenization_auto_test.py
│ │ ├── tokenization_utils_test.py
│ │ ├── tokenization_distilbert_test.py
│ │ ├── configuration_common_test.py
│ │ ├── tokenization_openai_test.py
│ │ ├── tokenization_ctrl_test.py
│ │ ├── tokenization_gpt2_test.py
│ │ ├── tokenization_transfo_xl_test.py
│ │ ├── tokenization_xlm_test.py
│ │ ├── tokenization_roberta_test.py
│ │ ├── modeling_tf_auto_test.py
│ │ └── modeling_auto_test.py
│ ├── data
│ │ ├── processors
│ │ │ ├── __init__.py
│ │ │ └── utils.py
│ │ ├── __init__.py
│ │ └── metrics
│ │ │ └── __init__.py
│ ├── configuration_roberta.py
│ ├── tokenization_distilbert.py
│ ├── convert_bert_original_tf_checkpoint_to_pytorch.py
│ ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py
│ ├── convert_openai_original_tf_checkpoint_to_pytorch.py
│ ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py
│ ├── configuration_distilbert.py
│ ├── log_utils.py
│ ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py
│ └── convert_bert_pytorch_checkpoint_to_original_tf.py
├── MANIFEST.in
├── docs
│ ├── source
│ │ ├── examples.md
│ │ ├── _static
│ │ │ └── css
│ │ │ │ ├── Calibre-Thin.otf
│ │ │ │ ├── Calibre-Light.ttf
│ │ │ │ ├── Calibre-Medium.otf
│ │ │ │ ├── Calibre-Regular.otf
│ │ │ │ └── code-snippets.css
│ │ ├── imgs
│ │ │ ├── transformers_logo_name.png
│ │ │ ├── warmup_constant_schedule.png
│ │ │ ├── warmup_cosine_schedule.png
│ │ │ ├── warmup_linear_schedule.png
│ │ │ ├── warmup_cosine_hard_restarts_schedule.png
│ │ │ └── warmup_cosine_warm_restarts_schedule.png
│ │ ├── main_classes
│ │ │ ├── configuration.rst
│ │ │ ├── model.rst
│ │ │ ├── tokenizer.rst
│ │ │ ├── optimizer_schedules.rst
│ │ │ └── processors.rst
│ │ ├── model_doc
│ │ │ ├── ctrl.rst
│ │ │ ├── transformerxl.rst
│ │ │ ├── auto.rst
│ │ │ ├── gpt2.rst
│ │ │ ├── roberta.rst
│ │ │ ├── gpt.rst
│ │ │ ├── xlm.rst
│ │ │ ├── distilbert.rst
│ │ │ ├── xlnet.rst
│ │ │ └── bert.rst
│ │ ├── bertology.rst
│ │ ├── notebooks.rst
│ │ ├── installation.md
│ │ ├── benchmarks.md
│ │ ├── multilingual.rst
│ │ └── converting_tensorflow_models.rst
│ ├── Makefile
│ ├── requirements.txt
│ └── README.md
├── examples
│ ├── requirements.txt
│ ├── tests_samples
│ │ ├── .gitignore
│ │ └── MRPC
│ │ │ ├── dev.tsv
│ │ │ └── train.tsv
│ ├── distillation
│ │ ├── requirements.txt
│ │ ├── training_configs
│ │ │ ├── distilgpt2.json
│ │ │ └── distilbert-base-uncased.json
│ │ ├── scripts
│ │ │ ├── token_counts.py
│ │ │ ├── binarized_data.py
│ │ │ ├── extract_distilbert.py
│ │ │ └── extract.py
│ │ ├── grouped_batch_sampler.py
│ │ └── utils.py
│ ├── contrib
│ │ └── README.md
│ ├── adversarial
│ │ ├── RUN_HANS.bash
│ │ ├── README.md
│ │ └── utils_hans.py
│ ├── new_get_conll.py
│ ├── get_conll.py
│ ├── lm_training
│ │ ├── fp16_utils.py
│ │ └── dist_comms.py
│ ├── run_tf_glue.py
│ ├── create_dataset.py
│ ├── test_examples.py
│ └── parallel.py
├── docker
│ └── Dockerfile
├── .coveragerc
├── requirements.txt
├── .github
│ ├── ISSUE_TEMPLATE
│ │ ├── question-help.md
│ │ ├── ---new-benchmark.md
│ │ ├── --new-model-addition.md
│ │ ├── feature-request.md
│ │ ├── bug-report.md
│ │ └── migration.md
│ └── stale.yml
├── requirements-dev.txt
├── .gitignore
├── setup.py
└── .circleci
│ └── config.yml
└── README.md
/transformers/transformers/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/transformers/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | 
--------------------------------------------------------------------------------
/transformers/docs/source/examples.md:
--------------------------------------------------------------------------------
1 | ../../examples/README.md
--------------------------------------------------------------------------------
/transformers/examples/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorboardX
2 | tensorboard
3 | scikit-learn
4 | seqeval
--------------------------------------------------------------------------------
/transformers/transformers/tests/fixtures/input.txt:
--------------------------------------------------------------------------------
1 | Who was Jim Henson ? ||| Jim Henson was a puppeteer
2 | 
--------------------------------------------------------------------------------
/transformers/examples/tests_samples/.gitignore:
--------------------------------------------------------------------------------
1 | *.*
2 | cache*
3 | temp*
4 | !*.tsv
5 | !*.json
6 | !.gitignore
--------------------------------------------------------------------------------
/transformers/docs/source/_static/css/Calibre-Thin.otf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arianhosseini/negation-learning/HEAD/transformers/docs/source/_static/css/Calibre-Thin.otf
--------------------------------------------------------------------------------
/transformers/docs/source/_static/css/Calibre-Light.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arianhosseini/negation-learning/HEAD/transformers/docs/source/_static/css/Calibre-Light.ttf
--------------------------------------------------------------------------------
/transformers/docs/source/_static/css/Calibre-Medium.otf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arianhosseini/negation-learning/HEAD/transformers/docs/source/_static/css/Calibre-Medium.otf
--------------------------------------------------------------------------------
/transformers/examples/distillation/requirements.txt:
--------------------------------------------------------------------------------
1 | gitpython==3.0.2
2 | tensorboard>=1.14.0
3 | tensorboardX==1.8
4 | psutil==5.6.3
5 | scipy==1.3.1
6 | transformers==2.0.0
7 | 
--------------------------------------------------------------------------------
/transformers/docs/source/_static/css/Calibre-Regular.otf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arianhosseini/negation-learning/HEAD/transformers/docs/source/_static/css/Calibre-Regular.otf
--------------------------------------------------------------------------------
/transformers/docs/source/imgs/transformers_logo_name.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arianhosseini/negation-learning/HEAD/transformers/docs/source/imgs/transformers_logo_name.png
--------------------------------------------------------------------------------
/transformers/docs/source/imgs/warmup_constant_schedule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arianhosseini/negation-learning/HEAD/transformers/docs/source/imgs/warmup_constant_schedule.png
--------------------------------------------------------------------------------
/transformers/docs/source/imgs/warmup_cosine_schedule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arianhosseini/negation-learning/HEAD/transformers/docs/source/imgs/warmup_cosine_schedule.png
--------------------------------------------------------------------------------
/transformers/docs/source/imgs/warmup_linear_schedule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arianhosseini/negation-learning/HEAD/transformers/docs/source/imgs/warmup_linear_schedule.png
--------------------------------------------------------------------------------
/transformers/transformers/tests/fixtures/test_sentencepiece.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arianhosseini/negation-learning/HEAD/transformers/transformers/tests/fixtures/test_sentencepiece.model
--------------------------------------------------------------------------------
/transformers/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arianhosseini/negation-learning/HEAD/transformers/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png
--------------------------------------------------------------------------------
/transformers/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arianhosseini/negation-learning/HEAD/transformers/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png
--------------------------------------------------------------------------------
/transformers/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM pytorch/pytorch:latest
2 | 
3 | RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext
4 | 
5 | RUN pip install transformers
6 | 
7 | WORKDIR /workspace
--------------------------------------------------------------------------------
/transformers/transformers/data/processors/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import InputExample, InputFeatures, DataProcessor
2 | from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
3 | 
4 | 
--------------------------------------------------------------------------------
/transformers/examples/distillation/training_configs/distilgpt2.json:
--------------------------------------------------------------------------------
1 | {
2 |   "initializer_range": 0.02,
3 |   "layer_norm_epsilon": 0.00001,
4 |   "n_ctx": 1024,
5 |   "n_embd": 768,
6 |   "n_head": 12,
7 |   "n_layer": 6,
8 |   "n_positions": 1024,
9 |   "vocab_size": 50257
10 | }
--------------------------------------------------------------------------------
/transformers/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | source=transformers
3 | omit =
4 |     # skip conversion scripts from testing for now
5 |     */convert_*
6 |     */__main__.py
7 | [report]
8 | exclude_lines =
9 |     pragma: no cover
10 |     raise
11 |     except
12 |     register_parameter
--------------------------------------------------------------------------------
/transformers/requirements.txt:
--------------------------------------------------------------------------------
1 | # progress bars in model download and training scripts
2 | tqdm
3 | # Accessing files from S3 directly.
4 | boto3
5 | # Used for downloading models over HTTP
6 | requests
7 | # For OpenAI GPT
8 | regex
9 | # For XLNet
10 | sentencepiece
11 | # For XLM
12 | sacremoses
--------------------------------------------------------------------------------
/transformers/.github/ISSUE_TEMPLATE/question-help.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: "❓Questions & Help"
3 | about: Start a general discussion related to PyTorch Transformers
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | ## ❓ Questions & Help
11 | 
12 | 
13 | 
--------------------------------------------------------------------------------
/transformers/examples/contrib/README.md:
--------------------------------------------------------------------------------
1 | # Community contributed examples
2 | 
3 | This folder contains examples which are not actively maintained (mostly contributed by the community).
4 | 
5 | Using these examples together with a recent version of the library usually requires making small (sometimes big) adaptations to get the scripts working.
6 | 
--------------------------------------------------------------------------------
/transformers/docs/source/_static/css/code-snippets.css:
--------------------------------------------------------------------------------
1 | 
2 | .highlight .c1, .highlight .sd{
3 |     color: #999
4 | }
5 | 
6 | .highlight .nn, .highlight .k, .highlight .s1, .highlight .nb, .highlight .bp, .highlight .kc {
7 |     color: #FB8D68;
8 | }
9 | 
10 | .highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow {
11 |     color: #6670FF;
12 | }
--------------------------------------------------------------------------------
/transformers/transformers/data/__init__.py:
--------------------------------------------------------------------------------
1 | from .processors import InputExample, InputFeatures, DataProcessor
2 | from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
3 | 
4 | from .metrics import is_sklearn_available
5 | if is_sklearn_available():
6 |     from .metrics import glue_compute_metrics
7 | 
--------------------------------------------------------------------------------
/transformers/examples/distillation/training_configs/distilbert-base-uncased.json:
--------------------------------------------------------------------------------
1 | {
2 |   "activation": "gelu",
3 |   "attention_dropout": 0.1,
4 |   "dim": 768,
5 |   "dropout": 0.1,
6 |   "hidden_dim": 3072,
7 |   "initializer_range": 0.02,
8 |   "max_position_embeddings": 512,
9 |   "n_heads": 12,
10 |   "n_layers": 6,
11 |   "sinusoidal_pos_embds": true,
12 |   "tie_weights_": true,
13 |   "vocab_size": 30522
14 | }
15 | 
--------------------------------------------------------------------------------
/transformers/examples/adversarial/RUN_HANS.bash:
--------------------------------------------------------------------------------
1 | export HANS_DIR=~/hans
2 | export MODEL_TYPE=bert
3 | export MODEL_PATH=/workdrive/test/glue_results/MNLI/
4 | 
5 | python test_hans.py \
6 | --task_name hans \
7 | --model_type $MODEL_TYPE \
8 | --do_eval \
9 | --data_dir $HANS_DIR \
10 | --model_name_or_path $MODEL_PATH \
11 | --max_seq_length 128 \
12 | --output_dir $MODEL_PATH \
13 | 
--------------------------------------------------------------------------------
/transformers/docs/source/main_classes/configuration.rst:
--------------------------------------------------------------------------------
1 | Configuration
2 | ----------------------------------------------------
3 | 
4 | The base class ``PretrainedConfig`` implements the common methods for loading/saving a configuration either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository).
5 | 
6 | ``PretrainedConfig``
7 | ~~~~~~~~~~~~~~~~~~~~~
8 | 
9 | .. autoclass:: transformers.PretrainedConfig
10 |     :members:
11 | 
--------------------------------------------------------------------------------
/transformers/.github/ISSUE_TEMPLATE/---new-benchmark.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: "\U0001F5A5 New Benchmark"
3 | about: You benchmark a part of this library and would like to share your results
4 | title: "[Benchmark]"
5 | labels: ''
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | # Benchmarking Transformers
11 | 
12 | ## Benchmark
13 | 
14 | Which part of Transformers did you benchmark?
15 | 
16 | ## Set-up
17 | 
18 | What did you run your benchmarks on? Please include details, such as: CPU, GPU? If using multiple GPUs, which parallelization did you use?
19 | 
20 | ## Results
21 | 
22 | Put your results here!
23 | 
--------------------------------------------------------------------------------
/transformers/.github/ISSUE_TEMPLATE/--new-model-addition.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: "\U0001F31FNew model addition"
3 | about: Submit a proposal/request to implement a new Transformer-based model
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | # 🌟New model addition
11 | 
12 | ## Model description
13 | 
14 | 
15 | 
16 | ## Open Source status
17 | 
18 | * [ ] the model implementation is available: (give details)
19 | * [ ] the model weights are available: (give details)
20 | 
21 | ## Additional context
22 | 
23 | 
24 | 
--------------------------------------------------------------------------------
/transformers/transformers/tests/conftest.py:
--------------------------------------------------------------------------------
1 | # content of conftest.py
2 | 
3 | import pytest
4 | 
5 | 
6 | def pytest_addoption(parser):
7 |     parser.addoption(
8 |         "--runslow", action="store_true", default=False, help="run slow tests"
9 |     )
10 | 
11 | 
12 | def pytest_collection_modifyitems(config, items):
13 |     if config.getoption("--runslow"):
14 |         # --runslow given in cli: do not skip slow tests
15 |         return
16 |     skip_slow = pytest.mark.skip(reason="need --runslow option to run")
17 |     for item in items:
18 |         if "slow" in item.keywords:
19 |             item.add_marker(skip_slow)
20 | 
--------------------------------------------------------------------------------
/transformers/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line.
5 | SPHINXOPTS = 
6 | SPHINXBUILD = sphinx-build
7 | SOURCEDIR = source
8 | BUILDDIR = _build
9 | 
10 | # Put it first so that "make" without argument is like "make help".
11 | help:
12 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13 | 
14 | .PHONY: help Makefile
15 | 
16 | # Catch-all target: route all unknown targets to Sphinx using the new
17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile
19 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/transformers/.github/ISSUE_TEMPLATE/feature-request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: "\U0001F680 Feature Request"
3 | about: Submit a proposal/request for a new PyTorch Transformers feature
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | ## 🚀 Feature
11 | 
12 | 
13 | 
14 | ## Motivation
15 | 
16 | 
17 | 
18 | ## Additional context
19 | 
20 | 
21 | 
--------------------------------------------------------------------------------
/transformers/.github/stale.yml:
--------------------------------------------------------------------------------
1 | # Number of days of inactivity before an issue becomes stale
2 | daysUntilStale: 60
3 | # Number of days of inactivity before a stale issue is closed
4 | daysUntilClose: 7
5 | # Issues with these labels will never be considered stale
6 | exemptLabels:
7 |   - pinned
8 |   - security
9 | # Label to use when marking an issue as stale
10 | staleLabel: wontfix
11 | # Comment to post when marking an issue as stale. Set to `false` to disable
12 | markComment: >
13 |   This issue has been automatically marked as stale because it has not had
14 |   recent activity. It will be closed if no further activity occurs. Thank you
15 |   for your contributions.
16 | # Comment to post when closing a stale issue. Set to `false` to disable
17 | closeComment: false
--------------------------------------------------------------------------------
/transformers/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | alabaster==0.7.12
2 | Babel==2.7.0
3 | certifi==2019.6.16
4 | chardet==3.0.4
5 | commonmark==0.9.0
6 | docutils==0.14
7 | future==0.17.1
8 | idna==2.8
9 | imagesize==1.1.0
10 | Jinja2==2.10.1
11 | MarkupSafe==1.1.1
12 | packaging==19.0
13 | Pygments==2.4.2
14 | pyparsing==2.4.0
15 | pytz==2019.1
16 | recommonmark==0.5.0
17 | requests==2.22.0
18 | six==1.12.0
19 | snowballstemmer==1.9.0
20 | Sphinx==2.1.2
21 | sphinx-rtd-theme==0.4.3
22 | sphinxcontrib-applehelp==1.0.1
23 | sphinxcontrib-devhelp==1.0.1
24 | sphinxcontrib-htmlhelp==1.0.2
25 | sphinxcontrib-jsmath==1.0.1
26 | sphinxcontrib-qthelp==1.0.2
27 | sphinxcontrib-serializinghtml==1.1.3
28 | urllib3==1.25.3
29 | sphinx-markdown-tables==0.0.9
30 | numpy==1.17.2
31 | tensorflow==2.0.0rc2
32 | torch==1.2.0
--------------------------------------------------------------------------------
/transformers/docs/source/main_classes/model.rst:
--------------------------------------------------------------------------------
1 | Models
2 | ----------------------------------------------------
3 | 
4 | The base class ``PreTrainedModel`` implements the common methods for loading/saving a model either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository).
5 | 
6 | ``PreTrainedModel`` also implements a few methods which are common among all the models to:
7 | 
8 | - resize the input token embeddings when new tokens are added to the vocabulary
9 | - prune the attention heads of the model.
10 | 
11 | ``PreTrainedModel``
12 | ~~~~~~~~~~~~~~~~~~~~~
13 | 
14 | .. autoclass:: transformers.PreTrainedModel
15 |     :members:
16 | 
17 | ``TFPreTrainedModel``
18 | ~~~~~~~~~~~~~~~~~~~~~
19 | 
20 | .. autoclass:: transformers.TFPreTrainedModel
21 |     :members:
22 | 
--------------------------------------------------------------------------------
/transformers/docs/source/model_doc/ctrl.rst:
--------------------------------------------------------------------------------
1 | CTRL
2 | ----------------------------------------------------
3 | 
4 | ``CTRLConfig``
5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
6 | 
7 | .. autoclass:: transformers.CTRLConfig
8 |     :members:
9 | 
10 | 
11 | ``CTRLTokenizer``
12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
13 | 
14 | .. autoclass:: transformers.CTRLTokenizer
15 |     :members:
16 | 
17 | 
18 | ``CTRLModel``
19 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20 | 
21 | .. autoclass:: transformers.CTRLModel
22 |     :members:
23 | 
24 | 
25 | ``CTRLLMHeadModel``
26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 | 
28 | .. autoclass:: transformers.CTRLLMHeadModel
29 |     :members:
30 | 
31 | 
32 | ``TFCTRLModel``
33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
34 | 
35 | .. autoclass:: transformers.TFCTRLModel
36 |     :members:
37 | 
38 | 
39 | ``TFCTRLLMHeadModel``
40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
41 | 
42 | .. autoclass:: transformers.TFCTRLLMHeadModel
43 |     :members:
44 | 
45 | 
--------------------------------------------------------------------------------
/transformers/docs/source/main_classes/tokenizer.rst:
--------------------------------------------------------------------------------
1 | Tokenizer
2 | ----------------------------------------------------
3 | 
4 | The base class ``PreTrainedTokenizer`` implements the common methods for loading/saving a tokenizer either from a local file or directory, or from a pretrained tokenizer provided by the library (downloaded from HuggingFace's AWS S3 repository).
5 | 
6 | ``PreTrainedTokenizer`` is the main entry point into tokenizers as it also implements the main methods for using all the tokenizers:
7 | 
8 | - tokenizing, converting tokens to ids and back, and encoding/decoding,
9 | - adding new tokens to the vocabulary in a way that is independent of the underlying structure (BPE, SentencePiece...),
10 | - managing special tokens (adding them, assigning them to roles, making sure they are not split during tokenization)
11 | 
12 | ``PreTrainedTokenizer``
13 | ~~~~~~~~~~~~~~~~~~~~~~~~
14 | 
15 | .. autoclass:: transformers.PreTrainedTokenizer
16 |     :members:
17 | 
--------------------------------------------------------------------------------
/transformers/docs/source/model_doc/transformerxl.rst:
--------------------------------------------------------------------------------
1 | Transformer XL
2 | ----------------------------------------------------
3 | 
4 | 
5 | ``TransfoXLConfig``
6 | ~~~~~~~~~~~~~~~~~~~~~
7 | 
8 | .. autoclass:: transformers.TransfoXLConfig
9 |     :members:
10 | 
11 | 
12 | ``TransfoXLTokenizer``
13 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
14 | 
15 | .. autoclass:: transformers.TransfoXLTokenizer
16 |     :members:
17 | 
18 | 
19 | ``TransfoXLModel``
20 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
21 | 
22 | .. autoclass:: transformers.TransfoXLModel
23 |     :members:
24 | 
25 | 
26 | ``TransfoXLLMHeadModel``
27 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
28 | 
29 | .. autoclass:: transformers.TransfoXLLMHeadModel
30 |     :members:
31 | 
32 | 
33 | ``TFTransfoXLModel``
34 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
35 | 
36 | .. autoclass:: transformers.TFTransfoXLModel
37 |     :members:
38 | 
39 | 
40 | ``TFTransfoXLLMHeadModel``
41 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
42 | 
43 | .. autoclass:: transformers.TFTransfoXLLMHeadModel
44 |     :members:
45 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # Negation-Learning
3 | 
4 | ## Train BERTNOT
5 | 
6 | ```bash
7 | python -m torch.distributed.launch --nproc_per_node=1 examples/lm_training/finetune_on_pregenerated_negation_distributed.py \
8 | --pregenerated_neg_data path/to/wiki20k_negated_withref_UL_pregenerated/ \
9 | --pregenerated_pos_data path/to/wiki20k_positive_withref_LL_pregenerated/ \
10 | --validation_neg_data path/to/validation/neg/ \
11 | --validation_pos_data path/to/validation/pos/ \
12 | --pregenerated_data path/to/lm_training_wiki20k/ \
13 | --output_dir path/to/output \
14 | --bert_model bert-base-cased \
15 | --fp16 \
16 | --exp_group neg_sb_gamma0.0_lr1e_5_e5_wiki20k \
17 | --learning_rate 1e-5 \
18 | --port_idx 0 \
19 | --kr_freq 0.0 \
20 | --train_batch_size 32 \
21 | --mlm_freq 0.0 \
22 | --epoch 5 \
23 | --gamma 0.0 \
24 | --seed $seed \
25 | --method neg_samebatch
26 | ```
27 | 
28 | Data can be downloaded from [here](https://zenodo.org/record/4737796).
29 | 
--------------------------------------------------------------------------------
/transformers/docs/source/model_doc/auto.rst:
--------------------------------------------------------------------------------
1 | AutoModels
2 | -----------
3 | 
4 | In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you are supplying to the ``from_pretrained`` method.
5 | 
6 | AutoClasses are here to do this job for you so that you automatically retrieve the relevant model given the name/path to the pretrained weights/config/vocabulary:
7 | 
8 | Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will directly create a class of the relevant architecture (ex: ``model = AutoModel.from_pretrained('bert-base-cased')`` will create an instance of ``BertModel``).
9 | 
10 | 
11 | ``AutoConfig``
12 | ~~~~~~~~~~~~~~~~~~~~~
13 | 
14 | .. autoclass:: transformers.AutoConfig
15 |     :members:
16 | 
17 | 
18 | ``AutoModel``
19 | ~~~~~~~~~~~~~~~~~~~~~
20 | 
21 | .. autoclass:: transformers.AutoModel
22 |     :members:
23 | 
24 | 
25 | ``AutoTokenizer``
26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 | 
28 | .. autoclass:: transformers.AutoTokenizer
29 |     :members:
30 | 
--------------------------------------------------------------------------------
/transformers/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | absl-py==0.8.0
2 | astor==0.8.0
3 | atomicwrites==1.3.0
4 | attrs==19.2.0
5 | boto3==1.9.243
6 | botocore==1.12.243
7 | certifi==2019.9.11
8 | chardet==3.0.4
9 | Click==7.0
10 | docutils==0.15.2
11 | gast==0.2.2
12 | google-pasta==0.1.7
13 | grpcio==1.24.1
14 | h5py==2.10.0
15 | idna==2.8
16 | importlib-metadata==0.23
17 | jmespath==0.9.4
18 | joblib==0.14.0
19 | Keras-Applications==1.0.8
20 | Keras-Preprocessing==1.1.0
21 | Markdown==3.1.1
22 | more-itertools==7.2.0
23 | numpy==1.17.2
24 | opt-einsum==3.1.0
25 | packaging==19.2
26 | pluggy==0.13.0
27 | protobuf==3.10.0
28 | py==1.8.0
29 | pyparsing==2.4.2
30 | pytest==5.2.1
31 | python-dateutil==2.8.0
32 | regex==2019.8.19
33 | requests==2.22.0
34 | s3transfer==0.2.1
35 | sacremoses==0.0.35
36 | sentencepiece==0.1.83
37 | six==1.12.0
38 | tensorboard==2.0.0
39 | tensorflow==2.0.0
40 | tensorflow-estimator==2.0.0
41 | termcolor==1.1.0
42 | torch==1.2.0
43 | tqdm==4.36.1
44 | urllib3==1.25.6
45 | wcwidth==0.1.7
46 | Werkzeug==0.16.0
47 | wrapt==1.11.2
48 | zipp==0.6.0
49 | 
--------------------------------------------------------------------------------
/transformers/docs/source/model_doc/gpt2.rst:
--------------------------------------------------------------------------------
1 | OpenAI GPT2
2 | ----------------------------------------------------
3 | 
4 | ``GPT2Config``
5 | ~~~~~~~~~~~~~~~~~~~~~
6 | 
7 | .. autoclass:: transformers.GPT2Config
8 |     :members:
9 | 
10 | 
11 | ``GPT2Tokenizer``
12 | ~~~~~~~~~~~~~~~~~~~~~
13 | 
14 | .. autoclass:: transformers.GPT2Tokenizer
15 |     :members:
16 | 
17 | 
18 | ``GPT2Model``
19 | ~~~~~~~~~~~~~~~~~~~~~
20 | 
21 | .. autoclass:: transformers.GPT2Model
22 |     :members:
23 | 
24 | 
25 | ``GPT2LMHeadModel``
26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 | 
28 | .. autoclass:: transformers.GPT2LMHeadModel
29 |     :members:
30 | 
31 | 
32 | ``GPT2DoubleHeadsModel``
33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
34 | 
35 | .. autoclass:: transformers.GPT2DoubleHeadsModel
36 |     :members:
37 | 
38 | 
39 | ``TFGPT2Model``
40 | ~~~~~~~~~~~~~~~~~~~~~
41 | 
42 | .. autoclass:: transformers.TFGPT2Model
43 |     :members:
44 | 
45 | 
46 | ``TFGPT2LMHeadModel``
47 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
48 | 
49 | .. autoclass:: transformers.TFGPT2LMHeadModel
50 |     :members:
51 | 
52 | 
53 | ``TFGPT2DoubleHeadsModel``
54 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
55 | 
56 | .. autoclass:: transformers.TFGPT2DoubleHeadsModel
57 |     :members:
58 | 
--------------------------------------------------------------------------------
/transformers/docs/source/model_doc/roberta.rst:
--------------------------------------------------------------------------------
1 | RoBERTa
2 | ----------------------------------------------------
3 | 
4 | ``RobertaConfig``
5 | ~~~~~~~~~~~~~~~~~~~~~
6 | 
7 | .. autoclass:: transformers.RobertaConfig
8 |     :members:
9 | 
10 | 
11 | ``RobertaTokenizer``
12 | ~~~~~~~~~~~~~~~~~~~~~
13 | 
14 | .. autoclass:: transformers.RobertaTokenizer
15 |     :members:
16 | 
17 | 
18 | ``RobertaModel``
19 | ~~~~~~~~~~~~~~~~~~~~
20 | 
21 | .. autoclass:: transformers.RobertaModel
22 |     :members:
23 | 
24 | 
25 | ``RobertaForMaskedLM``
26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
27 | 
28 | .. autoclass:: transformers.RobertaForMaskedLM
29 |     :members:
30 | 
31 | 
32 | ``RobertaForSequenceClassification``
33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
34 | 
35 | .. autoclass:: transformers.RobertaForSequenceClassification
36 |     :members:
37 | 
38 | 
39 | ``TFRobertaModel``
40 | ~~~~~~~~~~~~~~~~~~~~
41 | 
42 | .. autoclass:: transformers.TFRobertaModel
43 |     :members:
44 | 
45 | 
46 | ``TFRobertaForMaskedLM``
47 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
48 | 
49 | .. autoclass:: transformers.TFRobertaForMaskedLM
50 |     :members:
51 | 
52 | 
53 | ``TFRobertaForSequenceClassification``
54 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
55 | 
56 | .. autoclass:: transformers.TFRobertaForSequenceClassification
57 |     :members:
58 | 
--------------------------------------------------------------------------------
/transformers/docs/source/model_doc/gpt.rst:
--------------------------------------------------------------------------------
1 | OpenAI GPT
2 | ----------------------------------------------------
3 | 
4 | ``OpenAIGPTConfig``
5 | ~~~~~~~~~~~~~~~~~~~~~
6 | 
7 | .. autoclass:: transformers.OpenAIGPTConfig
8 |     :members:
9 | 
10 | 
11 | ``OpenAIGPTTokenizer``
12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
13 | 
14 | .. autoclass:: transformers.OpenAIGPTTokenizer
15 |     :members:
16 | 
17 | 
18 | ``OpenAIGPTModel``
19 | ~~~~~~~~~~~~~~~~~~~~~~~~~
20 | 
21 | .. autoclass:: transformers.OpenAIGPTModel
22 |     :members:
23 | 
24 | 
25 | ``OpenAIGPTLMHeadModel``
26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 | 
28 | .. autoclass:: transformers.OpenAIGPTLMHeadModel
29 |     :members:
30 | 
31 | 
32 | ``OpenAIGPTDoubleHeadsModel``
33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
34 | 
35 | .. autoclass:: transformers.OpenAIGPTDoubleHeadsModel
36 |     :members:
37 | 
38 | 
39 | ``TFOpenAIGPTModel``
40 | ~~~~~~~~~~~~~~~~~~~~~~~~~
41 | 
42 | .. autoclass:: transformers.TFOpenAIGPTModel
43 |     :members:
44 | 
45 | 
46 | ``TFOpenAIGPTLMHeadModel``
47 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
48 | 
49 | .. autoclass:: transformers.TFOpenAIGPTLMHeadModel
50 |     :members:
51 | 
52 | 
53 | ``TFOpenAIGPTDoubleHeadsModel``
54 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
55 | 
56 | .. autoclass:: transformers.TFOpenAIGPTDoubleHeadsModel
57 |     :members:
58 | 
--------------------------------------------------------------------------------
/transformers/.github/ISSUE_TEMPLATE/bug-report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: "\U0001F41B Bug Report"
3 | about: Submit a bug report to help us improve PyTorch Transformers
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | ## 🐛 Bug
11 | 
12 | 
13 | 
14 | Model I am using (Bert, XLNet....):
15 | 
16 | Language I am using the model on (English, Chinese....):
17 | 
18 | The problem arises when using:
19 | * [ ] the official example scripts: (give details)
20 | * [ ] my own modified scripts: (give details)
21 | 
22 | The task I am working on is:
23 | * [ ] an official GLUE/SQuAD task: (give the name)
24 | * [ ] my own task or dataset: (give details)
25 | 
26 | ## To Reproduce
27 | 
28 | Steps to reproduce the behavior:
29 | 
30 | 1.
31 | 2.
32 | 3.
33 | 
34 | 
35 | 
36 | ## Expected behavior
37 | 
38 | 
39 | 
40 | ## Environment
41 | 
42 | * OS:
43 | * Python version:
44 | * PyTorch version:
45 | * PyTorch Transformers version (or branch):
46 | * Using GPU ?
47 | * Distributed or parallel setup ?
48 | * Any other relevant information:
49 | 
50 | ## Additional context
51 | 
52 | 
53 | 
--------------------------------------------------------------------------------
/transformers/.github/ISSUE_TEMPLATE/migration.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: "\U0001F4DA Migration from PyTorch-pretrained-Bert"
3 | about: Report a problem when migrating from PyTorch-pretrained-Bert to Transformers
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | ## 📚 Migration
11 | 
12 | 
13 | 
14 | Model I am using (Bert, XLNet....):
15 | 
16 | Language I am using the model on (English, Chinese....):
17 | 
18 | The problem arises when using:
19 | * [ ] the official example scripts: (give details)
20 | * [ ] my own modified scripts: (give details)
21 | 
22 | The task I am working on is:
23 | * [ ] an official GLUE/SQuAD task: (give the name)
24 | * [ ] my own task or dataset: (give details)
25 | 
26 | Details of the issue:
27 | 
28 | 
29 | 
30 | ## Environment
31 | 
32 | * OS:
33 | * Python version:
34 | * PyTorch version:
35 | * PyTorch Transformers version (or branch):
36 | * Using GPU ?
37 | * Distributed or parallel setup ?
38 | * Any other relevant information:
39 | 
40 | ## Checklist
41 | 
42 | - [ ] I have read the migration guide in the readme.
43 | - [ ] I checked if a related official extension example runs on my machine.
44 | 
45 | ## Additional context
46 | 
47 | 
48 | 
--------------------------------------------------------------------------------
/transformers/examples/tests_samples/MRPC/dev.tsv:
--------------------------------------------------------------------------------
1 | Quality #1 ID #2 ID #1 String #2 String
2 | 1 1355540 1355592 He said the foodservice pie business doesn 't fit the company 's long-term growth strategy . " The foodservice pie business does not fit our long-term growth strategy .
3 | 0 2029631 2029565 Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war . His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
4 | 0 487993 487952 The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat . The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
5 | 1 1989515 1989458 The AFL-CIO is waiting until October to decide if it will endorse a candidate . The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
6 | 0 1783137 1782659 No dates have been set for the civil or the criminal trial . No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
7 | 1 3039165 3039036 Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed . It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
8 | 
--------------------------------------------------------------------------------
/transformers/examples/tests_samples/MRPC/train.tsv:
--------------------------------------------------------------------------------
1 | Quality #1 ID #2 ID #1 String #2 String
2 | 1 1355540 1355592 He said the foodservice pie business doesn 't fit the company 's long-term growth strategy . " The foodservice pie business does not fit our long-term growth strategy .
3 | 0 2029631 2029565 Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war . His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
4 | 0 487993 487952 The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat . The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
5 | 1 1989515 1989458 The AFL-CIO is waiting until October to decide if it will endorse a candidate . The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
6 | 0 1783137 1782659 No dates have been set for the civil or the criminal trial . No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
7 | 1 3039165 3039036 Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed . It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
8 | 
--------------------------------------------------------------------------------
/transformers/docs/source/bertology.rst:
--------------------------------------------------------------------------------
1 | BERTology
2 | ---------
3 | 
4 | There is a growing field of study concerned with investigating the inner workings of large-scale transformers like BERT (that some call "BERTology"). Some good examples of this field are:
5 | 
6 | 
7 | * BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: https://arxiv.org/abs/1905.05950
8 | * Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650
9 | * What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: https://arxiv.org/abs/1906.04341
10 | 
11 | In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted from the great work of Paul Michel (https://arxiv.org/abs/1905.10650):
12 | 
13 | 
14 | * accessing all the hidden-states of BERT/GPT/GPT-2,
15 | * accessing all the attention weights for each head of BERT/GPT/GPT-2,
16 | * retrieving heads' output values and gradients to be able to compute head importance scores and prune heads as explained in https://arxiv.org/abs/1905.10650.
17 | 
18 | To help you understand and use these features, we have added a specific example script: `bertology.py `_ which extracts information and prunes a model pre-trained on GLUE.
19 | 
--------------------------------------------------------------------------------
/transformers/docs/source/main_classes/optimizer_schedules.rst:
--------------------------------------------------------------------------------
1 | Optimizer
2 | ----------------------------------------------------
3 | 
4 | The ``.optimization`` module provides:
5 | 
6 | - an optimizer with weight decay fixed that can be used to fine-tune models, and
7 | - several schedules in the form of schedule objects that inherit from ``_LRSchedule``:
8 | 
9 | ``AdamW``
10 | ~~~~~~~~~~~~~~~~
11 | 
12 | .. autoclass:: transformers.AdamW
13 |     :members:
14 | 
15 | Schedules
16 | ----------------------------------------------------
17 | 
18 | Learning Rate Schedules
19 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20 | 
21 | .. autoclass:: transformers.ConstantLRSchedule
22 |     :members:
23 | 
24 | 
25 | .. autoclass:: transformers.WarmupConstantSchedule
26 |     :members:
27 | 
28 | .. image:: /imgs/warmup_constant_schedule.png
29 |    :target: /imgs/warmup_constant_schedule.png
30 |    :alt:
31 | 
32 | 
33 | .. autoclass:: transformers.WarmupCosineSchedule
34 |     :members:
35 | 
36 | .. image:: /imgs/warmup_cosine_schedule.png
37 |    :target: /imgs/warmup_cosine_schedule.png
38 |    :alt:
39 | 
40 | 
41 | .. autoclass:: transformers.WarmupCosineWithHardRestartsSchedule
42 |     :members:
43 | 
44 | .. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
45 |    :target: /imgs/warmup_cosine_hard_restarts_schedule.png
46 |    :alt:
47 | 
48 | 
49 | 
50 | .. autoclass:: transformers.WarmupLinearSchedule
51 |     :members:
52 | 
53 | .. image:: /imgs/warmup_linear_schedule.png
54 |    :target: /imgs/warmup_linear_schedule.png
55 |    :alt:
56 | 
--------------------------------------------------------------------------------
/transformers/examples/adversarial/README.md:
--------------------------------------------------------------------------------
1 | ## Adversarial evaluation of model performances
2 | 
3 | Here is an example of evaluating a model using adversarial evaluation of natural language inference with the Heuristic Analysis for NLI Systems (HANS) dataset [McCoy et al., 2019](https://arxiv.org/abs/1902.01007). The example was graciously provided by [Nafise Sadat Moosavi](https://github.com/ns-moosavi).
4 | 
5 | The HANS dataset can be downloaded from [this location](https://github.com/tommccoy1/hans).
6 | 
7 | This is an example of using test_hans.py:
8 | 
9 | ```bash
10 | export HANS_DIR=path-to-hans
11 | export MODEL_TYPE=type-of-the-model-e.g.-bert-roberta-xlnet-etc
12 | export MODEL_PATH=path-to-the-model-directory-that-is-trained-on-NLI-e.g.-by-using-run_glue.py
13 | 
14 | python examples/hans/test_hans.py \
15 | --task_name hans \
16 | --model_type $MODEL_TYPE \
17 | --do_eval \
18 | --data_dir $HANS_DIR \
19 | --model_name_or_path $MODEL_PATH \
20 | --max_seq_length 128 \
21 | --output_dir $MODEL_PATH \
22 | ```
23 | 
24 | This will create the hans_predictions.txt file in MODEL_PATH, which can then be evaluated using hans/evaluate_heur_output.py from the HANS dataset.
25 | 
26 | The results of the BERT-base model that is trained on MNLI using batch size 8 and the random seed 42 on the HANS dataset are as follows:
27 | 
28 | ```bash
29 | Heuristic entailed results:
30 | lexical_overlap: 0.9702
31 | subsequence: 0.9942
32 | constituent: 0.9962
33 | 
34 | Heuristic non-entailed results:
35 | lexical_overlap: 0.199
36 | subsequence: 0.0396
37 | constituent: 0.118
38 | ```
39 | 
--------------------------------------------------------------------------------
/transformers/docs/source/notebooks.rst:
--------------------------------------------------------------------------------
1 | Notebooks
2 | ================================================
3 | 
4 | We include `three Jupyter Notebooks `_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
5 | 
6 | 
7 | * 
8 |   The first notebook (\ `Comparing-TF-and-PT-models.ipynb `_\ ) extracts the hidden states of a full sequence on each layer of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden states of the models.
9 | 
10 | * 
11 |   The second notebook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb `_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
12 | 
13 | * 
14 |   The third notebook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb `_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
15 | 
16 | Please follow the instructions given in the notebooks to run and modify them.
17 | 
--------------------------------------------------------------------------------
/transformers/transformers/configuration_roberta.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ RoBERTa configuration """
17 | 
18 | from __future__ import (absolute_import, division, print_function,
19 |                         unicode_literals)
20 | 
21 | import logging
22 | 
23 | from .configuration_bert import BertConfig
24 | 
25 | logger = logging.getLogger(__name__)
26 | 
27 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
28 |     'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json",
29 |     'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json",
30 |     'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json",
31 |     'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json",
32 | }
33 | 
34 | 
35 | class RobertaConfig(BertConfig):
36 |     pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
37 | 
--------------------------------------------------------------------------------
/transformers/docs/source/model_doc/xlm.rst:
--------------------------------------------------------------------------------
1 | XLM
2 | ----------------------------------------------------
3 | 
4 | ``XLMConfig``
5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
6 | 
7 | .. autoclass:: transformers.XLMConfig
8 |     :members:
9 | 
10 | ``XLMTokenizer``
11 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
12 | 
13 | .. autoclass:: transformers.XLMTokenizer
14 |     :members:
15 | 
16 | ``XLMModel``
17 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
18 | 
19 | .. autoclass:: transformers.XLMModel
20 |     :members:
21 | 
22 | 
23 | ``XLMWithLMHeadModel``
24 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
25 | 
26 | .. autoclass:: transformers.XLMWithLMHeadModel
27 |     :members:
28 | 
29 | 
30 | ``XLMForSequenceClassification``
31 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
32 | 
33 | .. autoclass:: transformers.XLMForSequenceClassification
34 |     :members:
35 | 
36 | 
37 | ``XLMForQuestionAnswering``
38 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
39 | 
40 | .. autoclass:: transformers.XLMForQuestionAnswering
41 |     :members:
42 | 
43 | 
44 | ``TFXLMModel``
45 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
46 | 
47 | .. autoclass:: transformers.TFXLMModel
48 |     :members:
49 | 
50 | 
51 | ``TFXLMWithLMHeadModel``
52 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
53 | 
54 | .. autoclass:: transformers.TFXLMWithLMHeadModel
55 |     :members:
56 | 
57 | 
58 | ``TFXLMForSequenceClassification``
59 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
60 | 
61 | .. autoclass:: transformers.TFXLMForSequenceClassification
62 |     :members:
63 | 
64 | 
65 | ``TFXLMForQuestionAnsweringSimple``
66 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
67 | 
68 | .. autoclass:: transformers.TFXLMForQuestionAnsweringSimple
69 |     :members:
70 | 
--------------------------------------------------------------------------------
/transformers/docs/source/model_doc/distilbert.rst:
--------------------------------------------------------------------------------
1 | DistilBERT
2 | ----------------------------------------------------
3 | 
4 | ``DistilBertConfig``
5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
6 | 
7 | .. autoclass:: transformers.DistilBertConfig
8 |     :members:
9 | 
10 | 
11 | ``DistilBertTokenizer``
12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
13 | 
14 | .. autoclass:: transformers.DistilBertTokenizer
15 |     :members:
16 | 
17 | 
18 | ``DistilBertModel``
19 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20 | 
21 | .. autoclass:: transformers.DistilBertModel
22 |     :members:
23 | 
24 | 
25 | ``DistilBertForMaskedLM``
26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 | 
28 | .. autoclass:: transformers.DistilBertForMaskedLM
29 |     :members:
30 | 
31 | 
32 | ``DistilBertForSequenceClassification``
33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
34 | 
35 | .. autoclass:: transformers.DistilBertForSequenceClassification
36 |     :members:
37 | 
38 | 
39 | ``DistilBertForQuestionAnswering``
40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
41 | 
42 | .. autoclass:: transformers.DistilBertForQuestionAnswering
43 |     :members:
44 | 
45 | ``TFDistilBertModel``
46 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
47 | 
48 | .. autoclass:: transformers.TFDistilBertModel
49 |     :members:
50 | 
51 | 
52 | ``TFDistilBertForMaskedLM``
53 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
54 | 
55 | .. autoclass:: transformers.TFDistilBertForMaskedLM
56 |     :members:
57 | 
58 | 
59 | ``TFDistilBertForSequenceClassification``
60 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
61 | 
62 | .. autoclass:: transformers.TFDistilBertForSequenceClassification
63 |     :members:
64 | 
65 | 
66 | ``TFDistilBertForQuestionAnswering``
67 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
68 | 
69 | .. autoclass:: transformers.TFDistilBertForQuestionAnswering
70 |     :members:
71 | 
--------------------------------------------------------------------------------
/transformers/docs/source/model_doc/xlnet.rst:
--------------------------------------------------------------------------------
1 | XLNet
2 | ----------------------------------------------------
3 | 
4 | ``XLNetConfig``
5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
6 | 
7 | .. autoclass:: transformers.XLNetConfig
8 |     :members:
9 | 
10 | 
11 | ``XLNetTokenizer``
12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
13 | 
14 | .. autoclass:: transformers.XLNetTokenizer
15 |     :members:
16 | 
17 | 
18 | ``XLNetModel``
19 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20 | 
21 | .. autoclass:: transformers.XLNetModel
22 |     :members:
23 | 
24 | 
25 | ``XLNetLMHeadModel``
26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 | 
28 | .. autoclass:: transformers.XLNetLMHeadModel
29 |     :members:
30 | 
31 | 
32 | ``XLNetForSequenceClassification``
33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
34 | 
35 | .. autoclass:: transformers.XLNetForSequenceClassification
36 |     :members:
37 | 
38 | 
39 | ``XLNetForQuestionAnswering``
40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
41 | 
42 | .. autoclass:: transformers.XLNetForQuestionAnswering
43 |     :members:
44 | 
45 | 
46 | ``TFXLNetModel``
47 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
48 | 
49 | .. autoclass:: transformers.TFXLNetModel
50 |     :members:
51 | 
52 | 
53 | ``TFXLNetLMHeadModel``
54 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
55 | 
56 | .. autoclass:: transformers.TFXLNetLMHeadModel
57 |     :members:
58 | 
59 | 
60 | ``TFXLNetForSequenceClassification``
61 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
62 | 
63 | .. autoclass:: transformers.TFXLNetForSequenceClassification
64 |     :members:
65 | 
66 | 
67 | ``TFXLNetForQuestionAnsweringSimple``
68 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
69 | 
70 | .. autoclass:: transformers.TFXLNetForQuestionAnsweringSimple
71 |     :members:
72 | 
--------------------------------------------------------------------------------
/transformers/transformers/tests/tokenization_auto_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 | 
19 | import unittest
20 | import shutil
21 | import pytest
22 | import logging
23 | 
24 | from transformers import AutoTokenizer, BertTokenizer, GPT2Tokenizer
25 | from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
26 | 
27 | 
28 | class AutoTokenizerTest(unittest.TestCase):
29 |     def test_tokenizer_from_pretrained(self):
30 |         logging.basicConfig(level=logging.INFO)
31 |         for model_name in list(BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys())[:1]:
32 |             tokenizer = AutoTokenizer.from_pretrained(model_name)
33 |             self.assertIsNotNone(tokenizer)
34 |             self.assertIsInstance(tokenizer, BertTokenizer)
35 |             self.assertGreater(len(tokenizer), 0)
36 | 
37 |         for model_name in list(GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP.keys())[:1]:
38 |             tokenizer = AutoTokenizer.from_pretrained(model_name)
39 |             self.assertIsNotNone(tokenizer)
40 |             self.assertIsInstance(tokenizer, GPT2Tokenizer)
41 |             self.assertGreater(len(tokenizer), 0)
42 | 
43 | 
44 | if __name__ == "__main__":
45 |     unittest.main()
46 | 
--------------------------------------------------------------------------------
/transformers/transformers/tests/tokenization_utils_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 HuggingFace Inc..
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 | 
19 | import unittest
20 | import six
21 | 
22 | from transformers import PreTrainedTokenizer
23 | from transformers.tokenization_gpt2 import GPT2Tokenizer
24 | 
25 | class TokenizerUtilsTest(unittest.TestCase):
26 |     def check_tokenizer_from_pretrained(self, tokenizer_class):
27 |         s3_models = list(tokenizer_class.max_model_input_sizes.keys())
28 |         for model_name in s3_models[:1]:
29 |             tokenizer = tokenizer_class.from_pretrained(model_name)
30 |             self.assertIsNotNone(tokenizer)
31 |             self.assertIsInstance(tokenizer, tokenizer_class)
32 |             self.assertIsInstance(tokenizer, PreTrainedTokenizer)
33 | 
34 |             for special_tok in tokenizer.all_special_tokens:
35 |                 if six.PY2:
36 |                     self.assertIsInstance(special_tok, unicode)
37 |                 else:
38 |                     self.assertIsInstance(special_tok, str)
39 |                 special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
40 |                 self.assertIsInstance(special_tok_id, int)
41 | 
42 |     def test_pretrained_tokenizers(self):
43 |         self.check_tokenizer_from_pretrained(GPT2Tokenizer)
44 | 
45 | if __name__ == "__main__":
46 |     unittest.main()
47 | 
--------------------------------------------------------------------------------
/transformers/transformers/tests/tokenization_distilbert_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import, division, print_function, unicode_literals
16 | 
17 | import os
18 | import unittest
19 | from io import open
20 | 
21 | from transformers.tokenization_distilbert import DistilBertTokenizer
22 | 
23 | from .tokenization_tests_commons import CommonTestCases
24 | from .tokenization_bert_test import BertTokenizationTest
25 | 
26 | class DistilBertTokenizationTest(BertTokenizationTest):
27 | 
28 |     tokenizer_class = DistilBertTokenizer
29 | 
30 |     def get_tokenizer(self, **kwargs):
31 |         return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
32 | 
33 |     def test_sequence_builders(self):
34 |         tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
35 | 
36 |         text = tokenizer.encode("sequence builders")
37 |         text_2 = tokenizer.encode("multi-sequence build")
38 | 
39 |         encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
40 |         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
41 | 
42 |         assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
43 |         assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + \
44 |                text_2 + [tokenizer.sep_token_id]
45 | 
46 | 
47 | if __name__ == '__main__':
48 |     unittest.main()
49 | 
--------------------------------------------------------------------------------
/transformers/docs/README.md:
--------------------------------------------------------------------------------
1 | # Generating the documentation
2 | 
3 | To generate the documentation, you first have to build it. Several packages are necessary to build the doc;
4 | you can install them using:
5 | 
6 | ```bash
7 | pip install -r requirements.txt
8 | ```
9 | 
10 | ## Packages installed
11 | 
12 | Here's an overview of all the packages installed. If you ran the previous command installing all packages from
13 | `requirements.txt`, you do not need to run the following commands.
14 | 
15 | Building it requires the package `sphinx` that you can
16 | install using:
17 | 
18 | ```bash
19 | pip install -U sphinx
20 | ```
21 | 
22 | You will also need the custom [theme](https://github.com/readthedocs/sphinx_rtd_theme) from
23 | [Read The Docs](https://readthedocs.org/). You can install it using the following command:
24 | 
25 | ```bash
26 | pip install sphinx_rtd_theme
27 | ```
28 | 
29 | The third necessary package is the `recommonmark` package to accept Markdown as well as reStructuredText:
30 | 
31 | ```bash
32 | pip install recommonmark
33 | ```
34 | 
35 | ## Building the documentation
36 | 
37 | Make sure that there is a symlink from the `examples.md` file (pointing to `/examples/README.md`) inside the source folder. Run the following
38 | command to generate it:
39 | 
40 | ```bash
41 | ln -s ../../examples/README.md examples.md
42 | ```
43 | 
44 | Once you have set up `sphinx`, you can build the documentation by running the following command in the `/docs` folder:
45 | 
46 | ```bash
47 | make html
48 | ```
49 | 
50 | ---
51 | **NOTE**
52 | 
53 | If you are adding/removing elements from the toc-tree or from any structural item, it is recommended to clean the build
54 | directory before rebuilding. Run the following command to clean and build:
55 | 
56 | ```bash
57 | make clean && make html
58 | ```
59 | 
60 | ---
61 | 
62 | It should build the static app that will be available under `/docs/_build/html`.
63 | 
64 | ## Adding a new element to the tree (toc-tree)
65 | 
66 | Accepted files are reStructuredText (.rst) and Markdown (.md). Create a file with its extension and put it
67 | in the source directory. You can then link it to the toc-tree by putting the filename without the extension.
68 | 
--------------------------------------------------------------------------------
/transformers/examples/distillation/scripts/token_counts.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2019-present, the HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """
16 | Preprocessing script before training the distilled model.
17 | """ 18 | from collections import Counter 19 | import argparse 20 | import pickle 21 | import logging 22 | 23 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 24 | datefmt = '%m/%d/%Y %H:%M:%S', 25 | level = logging.INFO) 26 | logger = logging.getLogger(__name__) 27 | 28 | if __name__ == '__main__': 29 | parser = argparse.ArgumentParser(description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)") 30 | parser.add_argument("--data_file", type=str, default="data/dump.bert-base-uncased.pickle", 31 | help="The binarized dataset.") 32 | parser.add_argument("--token_counts_dump", type=str, default="data/token_counts.bert-base-uncased.pickle", 33 | help="The dump file.") 34 | parser.add_argument("--vocab_size", default=30522, type=int) 35 | args = parser.parse_args() 36 | 37 | logger.info(f'Loading data from {args.data_file}') 38 | with open(args.data_file, 'rb') as fp: 39 | data = pickle.load(fp) 40 | 41 | logger.info('Counting occurences for MLM.') 42 | counter = Counter() 43 | for tk_ids in data: 44 | counter.update(tk_ids) 45 | counts = [0]*args.vocab_size 46 | for k, v in counter.items(): 47 | counts[k] = v 48 | 49 | logger.info(f'Dump to {args.token_counts_dump}') 50 | with open(args.token_counts_dump, 'wb') as handle: 51 | pickle.dump(counts, handle, protocol=pickle.HIGHEST_PROTOCOL) 52 | -------------------------------------------------------------------------------- /transformers/.gitignore: -------------------------------------------------------------------------------- 1 | # Initially taken from Github's Python gitignore file 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | .dmypy.json 113 | dmypy.json 114 | 115 | # Pyre type checker 116 | .pyre/ 117 | 118 | # vscode 119 | .vscode 120 | 121 | # Pycharm 122 | .idea 123 | 124 | # TF code 125 | tensorflow_code 126 | 127 | # Models 128 | models 129 | proc_data 130 | 131 | # examples 132 | runs 133 | examples/runs 134 | 135 | # data 136 | /data 137 | serialization_dir 138 | 139 | # emacs 140 | *.*~ -------------------------------------------------------------------------------- /transformers/transformers/tests/configuration_common_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 HuggingFace Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
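# --- Editorial note --------------------------------------------------------
# A sketch of how this harness is driven from a model-specific test (the class
# name and keyword values below are hypothetical):
#
#     class BertConfigTest(unittest.TestCase):
#         def test_config(self):
#             tester = ConfigTester(self, config_class=BertConfig, vocab_size=99,
#                                   hidden_size=32, num_attention_heads=4,
#                                   num_hidden_layers=5)
#             tester.run_common_tests()
# ----------------------------------------------------------------------------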
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import copy 20 | import os 21 | import shutil 22 | import json 23 | import random 24 | import uuid 25 | 26 | import unittest 27 | import logging 28 | 29 | 30 | class ConfigTester(object): 31 | def __init__(self, parent, config_class=None, **kwargs): 32 | self.parent = parent 33 | self.config_class = config_class 34 | self.inputs_dict = kwargs 35 | 36 | def create_and_test_config_common_properties(self): 37 | config = self.config_class(**self.inputs_dict) 38 | self.parent.assertTrue(hasattr(config, 'vocab_size')) 39 | self.parent.assertTrue(hasattr(config, 'hidden_size')) 40 | self.parent.assertTrue(hasattr(config, 'num_attention_heads')) 41 | self.parent.assertTrue(hasattr(config, 'num_hidden_layers')) 42 | 43 | def create_and_test_config_to_json_string(self): 44 | config = self.config_class(**self.inputs_dict) 45 | obj = json.loads(config.to_json_string()) 46 | for key, value in self.inputs_dict.items(): 47 | self.parent.assertEqual(obj[key], value) 48 | 49 | def create_and_test_config_to_json_file(self): 50 | config_first = self.config_class(**self.inputs_dict) 51 | json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json") 52 | config_first.to_json_file(json_file_path) 53 | config_second = self.config_class.from_json_file(json_file_path) 54 | os.remove(json_file_path) 55 | self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) 56 | 57 | def run_common_tests(self): 58 | self.create_and_test_config_common_properties() 59 | self.create_and_test_config_to_json_string() 60 | self.create_and_test_config_to_json_file() 61 | 62 | if __name__ == "__main__": 63 | unittest.main() -------------------------------------------------------------------------------- /transformers/transformers/tokenization_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
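# --- Editorial note --------------------------------------------------------
# A usage sketch (the checkpoint name comes from the pretrained map defined
# below; the sample sentence is arbitrary):
#
#     from transformers import DistilBertTokenizer
#     tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
#     tokens = tokenizer.tokenize("Who was Jim Henson ?")  # BERT-style WordPiece
#     ids = tokenizer.convert_tokens_to_ids(tokens)
# ----------------------------------------------------------------------------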
15 | """Tokenization classes for DistilBERT.""" 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import collections 20 | import logging 21 | import os 22 | import unicodedata 23 | from io import open 24 | 25 | from .tokenization_bert import BertTokenizer 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} 30 | 31 | PRETRAINED_VOCAB_FILES_MAP = { 32 | 'vocab_file': 33 | { 34 | 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 35 | 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 36 | } 37 | } 38 | 39 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 40 | 'distilbert-base-uncased': 512, 41 | 'distilbert-base-uncased-distilled-squad': 512, 42 | } 43 | 44 | 45 | class DistilBertTokenizer(BertTokenizer): 46 | r""" 47 | Constructs a DistilBertTokenizer. 48 | :class:`~transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece 49 | 50 | Args: 51 | vocab_file: Path to a one-wordpiece-per-line vocabulary file 52 | do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False 53 | do_basic_tokenize: Whether to do basic tokenization before wordpiece. 54 | max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the 55 | minimum of this value (if specified) and the underlying BERT model's sequence length. 56 | never_split: List of tokens which will never be split during tokenization. Only has an effect when 57 | do_wordpiece_only=False 58 | """ 59 | 60 | vocab_files_names = VOCAB_FILES_NAMES 61 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 62 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 63 | -------------------------------------------------------------------------------- /transformers/examples/new_get_conll.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | from argparse import ArgumentParser 3 | from pathlib import Path 4 | import json 5 | import pickle 6 | import os 7 | 8 | import stanza 9 | from stanza.utils.conll import CoNLL 10 | import jsonlines 11 | 12 | from parallel import parallelized 13 | 14 | def process_lines(lines, nlp): 15 | try: 16 | doc = nlp(lines) 17 | conll = CoNLL.convert_dict(doc.to_dict()) 18 | return conll 19 | 20 | except Exception as e: 21 | # # print(e) 22 | return [] 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | def main(): 31 | parser = ArgumentParser() 32 | parser.add_argument('--data_path', type=Path, required=True) 33 | parser.add_argument('--generated_data_path', type=str, required=True) 34 | parser.add_argument("--max_num_proc", default=20, type=int) 35 | parser.add_argument("--pos_batch_size", default=15000, type=int) 36 | parser.add_argument("--tokenize_batch_size", default=15000, type=int) 37 | 38 | args = parser.parse_args() 39 | 40 | assert args.data_path.exists(), "data does not exist" 41 | # assert args.generated_data_path.is_dir(), "export path does not exist" 42 | 43 | 44 | stanza.download('en') 45 | nlp = stanza.Pipeline(processors='tokenize,mwt,pos,lemma,depparse', 46 | pos_batch_size=args.pos_batch_size, 47 | tokenize_batch_size=args.tokenize_batch_size, 48 | tokenize_pretokenized=False, 49 | mwt_batch_size=15000, 50 | lemma_batch_size=15000, 51 | depparse_batch_size=15000) 52 
53 |     line_count = 0
54 |     lines = []
55 |     with args.data_path.open() as f:
56 |         for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
57 |             line = line.strip()
58 |             if len(line.split()) > 7 and len(line.split()) <= 20:
59 |                 lines.append(line)
60 | 
61 | 
62 |     # file_count = 0
63 |     print(len(lines))
64 |     writer = open(args.generated_data_path, 'w')
65 | 
66 | 
67 |     for batch in tqdm(range(0, len(lines), 15000)):
68 | 
69 |         data = lines[batch:batch + 15000]
70 |         data = '\n'.join(data)
71 |         conlls = process_lines(data, nlp)
72 |         # conlls = parallelized(process_line, job_kwargs, max_workers=args.max_num_proc, progress=False)
73 |         for conll_line in conlls:
74 |             if len(conll_line) > 0:
75 |                 for conll_words in conll_line:
76 |                     writer.write("\t".join(conll_words) + '\n')
77 |                 writer.write("\n")
78 | 
79 |     writer.close()
80 |     print("done")
81 | 
82 | if __name__ == '__main__':
83 |     main()
84 | 
--------------------------------------------------------------------------------
/transformers/docs/source/main_classes/processors.rst:
--------------------------------------------------------------------------------
1 | Processors
2 | ----------------------------------------------------
3 | 
4 | This library includes processors for several traditional tasks. These processors can be used to process a dataset into
5 | examples that can be fed to a model.
6 | 
7 | Processors
8 | ~~~~~~~~~~~~~~~~~~~~~
9 | 
10 | All processors follow the same architecture which is that of the
11 | :class:`~transformers.data.processors.utils.DataProcessor`. The processor returns a list
12 | of :class:`~transformers.data.processors.utils.InputExample`. These
13 | :class:`~transformers.data.processors.utils.InputExample` can be converted to
14 | :class:`~transformers.data.processors.utils.InputFeatures` in order to be fed to the model.
15 | 
16 | .. autoclass:: transformers.data.processors.utils.DataProcessor
17 |     :members:
18 | 
19 | 
20 | .. autoclass:: transformers.data.processors.utils.InputExample
21 |     :members:
22 | 
23 | 
24 | .. autoclass:: transformers.data.processors.utils.InputFeatures
25 |     :members:
26 | 
27 | 
28 | GLUE
29 | ~~~~~~~~~~~~~~~~~~~~~
30 | 
31 | `General Language Understanding Evaluation (GLUE) <https://gluebenchmark.com/>`__ is a benchmark that evaluates
32 | the performance of models across a diverse set of existing NLU tasks. It was released together with the paper
33 | `GLUE: A multi-task benchmark and analysis platform for natural language understanding <https://arxiv.org/abs/1804.07461>`__.
34 | 
35 | This library hosts a total of 10 processors for the following tasks: MRPC, MNLI, MNLI (mismatched),
36 | CoLA, SST2, STSB, QQP, QNLI, RTE and WNLI.
37 | 
38 | Those processors are:
39 | - :class:`~transformers.data.processors.utils.MrpcProcessor`
40 | - :class:`~transformers.data.processors.utils.MnliProcessor`
41 | - :class:`~transformers.data.processors.utils.MnliMismatchedProcessor`
42 | - :class:`~transformers.data.processors.utils.ColaProcessor`
43 | - :class:`~transformers.data.processors.utils.Sst2Processor`
44 | - :class:`~transformers.data.processors.utils.StsbProcessor`
45 | - :class:`~transformers.data.processors.utils.QqpProcessor`
46 | - :class:`~transformers.data.processors.utils.QnliProcessor`
47 | - :class:`~transformers.data.processors.utils.RteProcessor`
48 | - :class:`~transformers.data.processors.utils.WnliProcessor`
49 | 
50 | Additionally, the following method can be used to load values from a data file and convert them to a list of
51 | :class:`~transformers.data.processors.utils.InputExample`.
52 | 
53 | .. automethod:: transformers.data.processors.glue.glue_convert_examples_to_features
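As a rough sketch (the tokenizer and the data location below are illustrative assumptions, not mandated by the library), this helper is typically combined with a processor:

.. code-block:: python

    from transformers import BertTokenizer
    from transformers.data.processors.glue import MrpcProcessor, glue_convert_examples_to_features

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    processor = MrpcProcessor()
    examples = processor.get_dev_examples("/path/to/MRPC")  # expects a dev.tsv in this folder
    features = glue_convert_examples_to_features(examples, tokenizer, max_length=128, task="mrpc")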
54 | 
55 | Example usage
56 | ^^^^^^^^^^^^^^^^^^^^^^^^^
57 | 
58 | An example using these processors is given in the
59 | `run_glue.py <https://github.com/huggingface/transformers/blob/master/examples/run_glue.py>`__ script.
--------------------------------------------------------------------------------
/transformers/examples/get_conll.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 | from argparse import ArgumentParser
3 | from pathlib import Path
4 | import json
5 | import pickle
6 | 
7 | import stanza
8 | from stanza.utils.conll import CoNLL
9 | import jsonlines
10 | 
11 | from parallel import parallelized
12 | 
13 | def process_line(line, nlp):
14 |     try:
15 |         doc = nlp(line)
16 |         conll = CoNLL.convert_dict(doc.to_dict())
17 |         return conll[0]
18 | 
19 |     except Exception as e:
20 |         # # print(e)
21 |         return []
22 | 
23 | 
24 | 
25 | 
26 | 
27 | 
28 | 
29 | def main():
30 |     parser = ArgumentParser()
31 |     parser.add_argument('--data_path', type=Path, required=True)
32 |     parser.add_argument('--generated_data_path', type=str, required=True)
33 |     parser.add_argument("--max_num_proc", default=20, type=int)
34 |     parser.add_argument("--pos_batch_size", default=3000, type=int)
35 |     parser.add_argument("--tokenize_batch_size", default=128, type=int)
36 | 
37 |     args = parser.parse_args()
38 | 
39 |     assert args.data_path.exists(), "data does not exist"
40 |     # assert args.generated_data_path.is_dir(), "export path does not exist"
41 |     stanza.download('en')
42 |     nlp = stanza.Pipeline(processors='tokenize,pos,mwt,lemma,depparse',
43 |                           pos_batch_size=args.pos_batch_size,
44 |                           tokenize_batch_size=args.tokenize_batch_size,
45 |                           tokenize_pretokenized=False,
46 |                           mwt_batch_size=100,
47 |                           lemma_batch_size=100,
48 |                           depparse_batch_size=6000)
49 | 
50 |     line_count = 0
51 |     lines = []
52 |     with args.data_path.open() as f:
53 |         for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
54 |             line = line.strip()
55 |             if len(line.split()) > 7 and len(line.split()) <= 20:
56 |                 lines.append(line)
57 | 
58 | 
59 |     # file_count = 0
60 |     print(len(lines))
61 |     writer = open(args.generated_data_path, 'w')
62 | 
63 | 
64 |     for batch in tqdm(range(0, len(lines), 1000)):
65 |         job_kwargs = []
66 |         for line in lines[batch:batch + 1000]:
67 |             job_kwargs.append({"line": line, "nlp": nlp})
68 |             # res = check_line(line, templates, nlp)
69 |         conlls = parallelized(process_line, job_kwargs, max_workers=args.max_num_proc, progress=False)
70 | 
71 |         for conll_line in conlls:
72 |             if len(conll_line) > 0:
73 |                 for conll_words in conll_line:
74 |                     writer.write("\t".join(conll_words) + '\n')
75 |                 writer.write("\n")
76 | 
77 |     writer.close()
78 |     print("done")
79 | 
80 | if __name__ == '__main__':
81 |     main()
82 | 
--------------------------------------------------------------------------------
/transformers/docs/source/installation.md:
--------------------------------------------------------------------------------
1 | # Installation
2 | 
3 | Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on Python 3.5+) and PyTorch 1.1.0.
4 | 
5 | ## With pip
6 | 
7 | Transformers can be installed using pip as follows:
8 | 
9 | ``` bash
10 | pip install transformers
11 | ```
12 | 
13 | ## From source
14 | 
15 | To install from source, clone the repository and install with:
16 | 
17 | ``` bash
18 | git clone https://github.com/huggingface/transformers.git
19 | cd transformers
20 | pip install [--editable] .
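# optional sanity check (illustrative, not part of the official instructions):
python -c "import transformers; print(transformers.__version__)"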
21 | ``` 22 | 23 | ## Tests 24 | 25 | An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples). 26 | 27 | Tests can be run using `pytest` (install pytest if needed with `pip install pytest`). 28 | 29 | Run all the tests from the root of the cloned repository with the commands: 30 | 31 | ``` bash 32 | python -m pytest -sv ./transformers/tests/ 33 | python -m pytest -sv ./examples/ 34 | ``` 35 | 36 | ## OpenAI GPT original tokenization workflow 37 | 38 | If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (use version 4.4.3 if you are using Python 2) and `SpaCy`: 39 | 40 | ``` bash 41 | pip install spacy ftfy==4.4.3 42 | python -m spacy download en 43 | ``` 44 | 45 | If you don't install `ftfy` and `SpaCy`, the `OpenAI GPT` tokenizer will default to tokenize using BERT's `BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry). 46 | 47 | ## Note on model downloads (Continuous Integration or large-scale deployments) 48 | 49 | If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way faster, and cheaper. Feel free to contact us privately if you need any help. 50 | 51 | ## Do you want to run a Transformer model on a mobile device? 52 | 53 | You should check out our [swift-coreml-transformers](https://github.com/huggingface/swift-coreml-transformers) repo. 54 | 55 | It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`, `DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices. 56 | 57 | At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML, 58 | or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting! 59 | -------------------------------------------------------------------------------- /transformers/examples/lm_training/fp16_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import apex 3 | 4 | 5 | FP16_ENABLED = False 6 | # Apex setting. 
the O2 level keeps batchnorm layers and a master copy of the weights in fp32
7 | OPTIMIZATION_LEVEL = 'O2'
8 | LOSS_SCALING_ENABLED = True
9 | 
10 | 
11 | def disable_loss_scaling():
12 |     global LOSS_SCALING_ENABLED
13 |     LOSS_SCALING_ENABLED = False
14 |     return
15 | 
16 | 
17 | def enable_fp16():
18 |     global FP16_ENABLED
19 |     FP16_ENABLED = True
20 |     return
21 | 
22 | 
23 | def is_fp16():
24 |     return FP16_ENABLED
25 | 
26 | 
27 | def is_loss_scaling_enabled():
28 |     return LOSS_SCALING_ENABLED
29 | 
30 | 
31 | def get_optim_level():
32 |     return OPTIMIZATION_LEVEL
33 | 
34 | 
35 | def set_optim_level(opt_level):
36 |     global OPTIMIZATION_LEVEL
37 |     OPTIMIZATION_LEVEL = opt_level
38 |     return
39 | 
40 | 
41 | def clip_grad(optimizer, parameters, norm=5.):
42 |     if is_fp16():
43 |         torch.nn.utils.clip_grad_norm_(apex.amp.master_params(optimizer), norm)
44 |     else:
45 |         torch.nn.utils.clip_grad_norm_(parameters, norm)
46 | 
47 | 
48 | # Convert a tensor to half precision if FP16_ENABLED
49 | def maybe_half(tensor):
50 |     return tensor.half() if is_fp16() else tensor
51 | 
52 | 
53 | def initialize(model, lr=0.0005):
54 |     if is_fp16():
55 |         # from apex.optimizers import FP16_Optimizer
56 |         # from apex.optimizers import FusedAdam
57 |         optimizer = torch.optim.Adam(model.parameters(), lr=lr)
58 |         model, optimizer = apex.amp.initialize(model, optimizer, opt_level=get_optim_level())
59 |         # model = apex.parallel.DistributedDataParallel(model)
60 |         # model = model.half()
61 |         # optimizer = FusedAdam(model.parameters(), lr=lr, bias_correction=False)
62 |         # optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
63 |         return model, optimizer
64 |     else:
65 |         optimizer = torch.optim.Adam(model.parameters(), lr=lr)
66 |         return model, optimizer
67 | 
68 | 
69 | def get_optimizer(obj):
70 |     from apex.fp16_utils import FP16_Optimizer
71 | 
72 |     # Apex introduces the FP16_Optimizer object.
73 |     # However, this isn't really an optimizer, but only a wrapper around one.
74 |     # This function returns the actual optimizer.
75 |     if type(obj) == FP16_Optimizer:
76 |         return obj.optimizer
77 |     # If obj is not an FP16_Optimizer then we are not running in mixed precision
78 |     # and the passed object is already an actual optimizer
79 |     return obj
80 | 
81 | 
82 | def backward(loss, optimizer):
83 |     if FP16_ENABLED and is_loss_scaling_enabled():
84 |         # optimizer.backward(loss)
85 |         with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
86 |             scaled_loss.backward()
87 |     else:
88 |         loss.backward()
89 |     return
90 | 
91 | 
92 | #
93 | # EYE BUFFER
94 | #
95 | 
--------------------------------------------------------------------------------
/transformers/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
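# --- Editorial note --------------------------------------------------------
# Example invocation (all paths are placeholders):
#
#     python convert_bert_original_tf_checkpoint_to_pytorch.py \
#         --tf_checkpoint_path /path/to/bert_model.ckpt \
#         --bert_config_file /path/to/bert_config.json \
#         --pytorch_dump_path /path/to/pytorch_model.bin
# ----------------------------------------------------------------------------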
15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import argparse 22 | import torch 23 | 24 | from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert 25 | 26 | import logging 27 | logging.basicConfig(level=logging.INFO) 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = BertConfig.from_json_file(bert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = BertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_bert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | ## Required parameters 46 | parser.add_argument("--tf_checkpoint_path", 47 | default = None, 48 | type = str, 49 | required = True, 50 | help = "Path to the TensorFlow checkpoint path.") 51 | parser.add_argument("--bert_config_file", 52 | default = None, 53 | type = str, 54 | required = True, 55 | help = "The config json file corresponding to the pre-trained BERT model. \n" 56 | "This specifies the model architecture.") 57 | parser.add_argument("--pytorch_dump_path", 58 | default = None, 59 | type = str, 60 | required = True, 61 | help = "Path to the output PyTorch model.") 62 | args = parser.parse_args() 63 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, 64 | args.bert_config_file, 65 | args.pytorch_dump_path) 66 | -------------------------------------------------------------------------------- /transformers/transformers/tests/tokenization_openai_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | import json 20 | 21 | from transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES 22 | 23 | from .tokenization_tests_commons import CommonTestCases 24 | 25 | 26 | class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester): 27 | 28 | tokenizer_class = OpenAIGPTTokenizer 29 | 30 | def setUp(self): 31 | super(OpenAIGPTTokenizationTest, self).setUp() 32 | 33 | # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt
34 |         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
35 |                  "w</w>", "r</w>", "t</w>",
36 |                  "lo", "low", "er</w>",
37 |                  "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>"]
38 |         vocab_tokens = dict(zip(vocab, range(len(vocab))))
39 |         merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
40 | 
41 |         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
42 |         self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
43 |         with open(self.vocab_file, "w") as fp:
44 |             fp.write(json.dumps(vocab_tokens))
45 |         with open(self.merges_file, "w") as fp:
46 |             fp.write("\n".join(merges))
47 | 
48 |     def get_tokenizer(self, **kwargs):
49 |         return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname, **kwargs)
50 | 
51 |     def get_input_output_texts(self):
52 |         input_text = u"lower newer"
53 |         output_text = u"lower newer"
54 |         return input_text, output_text
55 | 
56 | 
57 |     def test_full_tokenizer(self):
58 |         tokenizer = OpenAIGPTTokenizer(self.vocab_file, self.merges_file)
59 | 
60 |         text = "lower"
61 |         bpe_tokens = ["low", "er</w>"]
62 |         tokens = tokenizer.tokenize(text)
63 |         self.assertListEqual(tokens, bpe_tokens)
64 | 
65 |         input_tokens = tokens + ["<unk>"]
66 |         input_bpe_tokens = [14, 15, 20]
67 |         self.assertListEqual(
68 |             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
69 | 
70 | 
71 | if __name__ == '__main__':
72 |     unittest.main()
73 | 
--------------------------------------------------------------------------------
/transformers/docs/source/benchmarks.md:
--------------------------------------------------------------------------------
1 | # Benchmarks
2 | 
3 | This section is dedicated to the benchmarks done by the library, by maintainers, contributors and users. These
4 | benchmarks will help keep track of the performance improvements that are brought to our models across versions.
5 | 
6 | ## Benchmarking all models for inference
7 | 
8 | As of version 2.1 we have benchmarked all models for inference, across many different settings: using PyTorch, with
9 | and without TorchScript, using TensorFlow, with and without XLA. All of those tests were done across CPUs (except for
10 | TensorFlow XLA) and GPUs.
11 | 
12 | The approach is detailed in the [following blogpost](https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2).
13 | 
14 | The results are available [here](https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing).
15 | 
16 | ## TF2 with mixed precision, XLA, Distribution (@tlkh)
17 | 
18 | This work was done by [Timothy Liu](https://github.com/tlkh).
19 | 
20 | There are very positive results to be gained from the various TensorFlow 2.0 features (a short sketch of how they are enabled is given below):
21 | 
22 | - Automatic Mixed Precision (AMP)
23 | - XLA compiler
24 | - Distribution strategies (multi-GPU)
25 | 
26 | The benefits are listed here (tested on CoLA, MRPC, SST-2):
27 | 
28 | - AMP: Between 1.4x and 1.6x decrease in overall time without change in batch size
29 | - AMP+XLA: Up to 2.5x decrease in overall time on SST-2 (larger dataset)
30 | - Distribution: Between 1.4x and 3.4x decrease in overall time on 4xV100
31 | - Combined: Up to 5.7x decrease in overall training time, or 9.1x training throughput
32 | 
33 | The model quality (measured by the validation accuracy) fluctuates slightly.
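As a rough sketch (mirroring the TF 2.0 flags used in this repo's `examples/run_tf_glue.py`; the exact APIs changed in later TF releases), the three features are switched on like this:

```python
import tensorflow as tf

# XLA: JIT-compile the model's graphs
tf.config.optimizer.set_jit(True)

# AMP: automatic mixed precision through a graph rewrite
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})

# Distribution: replicate the model over the visible GPUs
strategy = tf.distribute.MirroredStrategy()
```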
34 | Taking an average of 4 training runs on a single GPU gives the following results:
35 | 
36 | - CoLA: AMP results in slightly lower accuracy (0.820 vs 0.824)
37 | - MRPC: AMP results in lower accuracy (0.823 vs 0.835)
38 | - SST-2: AMP results in slightly lower accuracy (0.918 vs 0.922)
39 | 
40 | However, in a distributed setting with 4xV100 (4x batch size), AMP can yield better results:
41 | 
42 | - CoLA: AMP results in higher accuracy (0.828 vs 0.812)
43 | - MRPC: AMP results in lower accuracy (0.817 vs 0.827)
44 | - SST-2: AMP results in slightly lower accuracy (0.926 vs 0.929)
45 | 
46 | The benchmark script is available [here](https://github.com/NVAITC/benchmarking/blob/master/tf2/bert_dist.py).
47 | 
48 | Note: on some tasks (e.g. MRPC), the dataset is too small. The overhead due to the model compilation with XLA as well
49 | as the distribution strategy setup does not speed things up. The XLA compile time is also the reason why, although throughput
50 | can increase a lot (e.g. 2.7x for a single GPU), the overall (end-to-end) training speed-up is not as large (as low as 1.4x).
51 | 
52 | The benefit seen on SST-2 (a larger dataset) is much clearer.
53 | 
54 | All results can be seen on this [Google Sheet](https://docs.google.com/spreadsheets/d/1538MN224EzjbRL239sqSiUy6YY-rAjHyXhTzz_Zptls/edit#gid=960868445).
55 | 
--------------------------------------------------------------------------------
/transformers/docs/source/model_doc/bert.rst:
--------------------------------------------------------------------------------
1 | BERT
2 | ----------------------------------------------------
3 | 
4 | ``BertConfig``
5 | ~~~~~~~~~~~~~~~~~~~~~
6 | 
7 | .. autoclass:: transformers.BertConfig
8 |     :members:
9 | 
10 | 
11 | ``BertTokenizer``
12 | ~~~~~~~~~~~~~~~~~~~~~
13 | 
14 | .. autoclass:: transformers.BertTokenizer
15 |     :members:
16 | 
17 | 
18 | ``BertModel``
19 | ~~~~~~~~~~~~~~~~~~~~
20 | 
21 | .. autoclass:: transformers.BertModel
22 |     :members:
23 | 
24 | 
25 | ``BertForPreTraining``
26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 | 
28 | .. autoclass:: transformers.BertForPreTraining
29 |     :members:
30 | 
31 | 
32 | ``BertForMaskedLM``
33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
34 | 
35 | .. autoclass:: transformers.BertForMaskedLM
36 |     :members:
37 | 
38 | 
39 | ``BertForNextSentencePrediction``
40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
41 | 
42 | .. autoclass:: transformers.BertForNextSentencePrediction
43 |     :members:
44 | 
45 | 
46 | ``BertForSequenceClassification``
47 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
48 | 
49 | .. autoclass:: transformers.BertForSequenceClassification
50 |     :members:
51 | 
52 | 
53 | ``BertForMultipleChoice``
54 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
55 | 
56 | .. autoclass:: transformers.BertForMultipleChoice
57 |     :members:
58 | 
59 | 
60 | ``BertForTokenClassification``
61 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
62 | 
63 | .. autoclass:: transformers.BertForTokenClassification
64 |     :members:
65 | 
66 | 
67 | ``BertForQuestionAnswering``
68 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
69 | 
70 | .. autoclass:: transformers.BertForQuestionAnswering
71 |     :members:
72 | 
73 | 
74 | ``TFBertModel``
75 | ~~~~~~~~~~~~~~~~~~~~
76 | 
77 | .. autoclass:: transformers.TFBertModel
78 |     :members:
79 | 
80 | 
81 | ``TFBertForPreTraining``
82 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
83 | 
84 | .. autoclass:: transformers.TFBertForPreTraining
85 |     :members:
86 | 
87 | 
88 | ``TFBertForMaskedLM``
89 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
90 | 
91 | .. 
autoclass:: transformers.TFBertForMaskedLM 92 | :members: 93 | 94 | 95 | ``TFBertForNextSentencePrediction`` 96 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 97 | 98 | .. autoclass:: transformers.TFBertForNextSentencePrediction 99 | :members: 100 | 101 | 102 | ``TFBertForSequenceClassification`` 103 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 104 | 105 | .. autoclass:: transformers.TFBertForSequenceClassification 106 | :members: 107 | 108 | 109 | ``TFBertForMultipleChoice`` 110 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 111 | 112 | .. autoclass:: transformers.TFBertForMultipleChoice 113 | :members: 114 | 115 | 116 | ``TFBertForTokenClassification`` 117 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 118 | 119 | .. autoclass:: transformers.TFBertForTokenClassification 120 | :members: 121 | 122 | 123 | ``TFBertForQuestionAnswering`` 124 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 125 | 126 | .. autoclass:: transformers.TFBertForQuestionAnswering 127 | :members: 128 | 129 | -------------------------------------------------------------------------------- /transformers/transformers/tests/tokenization_ctrl_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Salesforce and HuggingFace Inc. team. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from __future__ import absolute_import, division, print_function, unicode_literals 15 | 16 | import os 17 | import unittest 18 | import json 19 | from io import open 20 | 21 | from transformers.tokenization_ctrl import CTRLTokenizer, VOCAB_FILES_NAMES 22 | 23 | from .tokenization_tests_commons import CommonTestCases 24 | 25 | class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester): 26 | 27 | tokenizer_class = CTRLTokenizer 28 | 29 | def setUp(self): 30 | super(CTRLTokenizationTest, self).setUp() 31 | 32 | # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt
33 |         vocab = ['adapt', 're@@', 'a@@', 'apt', 'c@@', 't', '<unk>']
34 |         vocab_tokens = dict(zip(vocab, range(len(vocab))))
35 |         merges = ["#version: 0.2", 'a p', 'ap t</w>', 'r e', 'a d', 'ad apt</w>', '']
36 |         self.special_tokens_map = {"unk_token": "<unk>"}
37 | 
38 |         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
39 |         self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
40 |         with open(self.vocab_file, "w", encoding="utf-8") as fp:
41 |             fp.write(json.dumps(vocab_tokens) + "\n")
42 |         with open(self.merges_file, "w", encoding="utf-8") as fp:
43 |             fp.write("\n".join(merges))
44 | 
45 |     def get_tokenizer(self, **kwargs):
46 |         kwargs.update(self.special_tokens_map)
47 |         return CTRLTokenizer.from_pretrained(self.tmpdirname, **kwargs)
48 | 
49 |     def get_input_output_texts(self):
50 |         input_text = u"adapt react readapt apt"
51 |         output_text = u"adapt react readapt apt"
52 |         return input_text, output_text
53 | 
54 |     def test_full_tokenizer(self):
55 |         tokenizer = CTRLTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
56 |         text = "adapt react readapt apt"
57 |         bpe_tokens = 'adapt re@@ a@@ c@@ t re@@ adapt apt'.split()
58 |         tokens = tokenizer.tokenize(text)
59 |         self.assertListEqual(tokens, bpe_tokens)
60 | 
61 |         input_tokens = tokens + [tokenizer.unk_token]
62 | 
63 |         input_bpe_tokens = [0, 1, 2, 4, 5, 1, 0, 3, 6]
64 |         self.assertListEqual(
65 |             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
66 | 
67 | 
68 | if __name__ == '__main__':
69 |     unittest.main()
70 | 
--------------------------------------------------------------------------------
/transformers/setup.py:
--------------------------------------------------------------------------------
1 | """
2 | Simple check list from AllenNLP repo: https://github.com/allenai/allennlp/blob/master/setup.py
3 | 
4 | To create the package for PyPI.
5 | 
6 | 1. Change the version in __init__.py, setup.py as well as docs/source/conf.py.
7 | 
8 | 2. Commit these changes with the message: "Release: VERSION"
9 | 
10 | 3. Add a tag in git to mark the release: "git tag VERSION -m'Adds tag VERSION for pypi' "
11 |    Push the tag to git: git push --tags origin master
12 | 
13 | 4. Build both the sources and the wheel. Do not change anything in setup.py between
14 |    creating the wheel and the source distribution (obviously).
15 | 
16 |    For the wheel, run: "python setup.py bdist_wheel" in the top level directory.
17 |    (this will build a wheel for the python version you use to build it - make sure you use python 3.x).
18 | 
19 |    For the sources, run: "python setup.py sdist"
20 |    You should now have a /dist directory with both .whl and .tar.gz source versions.
21 | 
22 | 5. Check that everything looks correct by uploading the package to the PyPI test server:
23 | 
24 |    twine upload dist/* -r pypitest
25 |    (PyPI suggests using twine, as other methods upload files via plaintext.)
26 | 
27 |    Check that you can install it in a virtualenv by running:
28 |    pip install -i https://testpypi.python.org/pypi transformers
29 | 
30 | 6. Upload the final version to the actual PyPI:
31 |    twine upload dist/* -r pypi
32 | 
33 | 7. Copy the release notes from RELEASE.md to the tag on GitHub once everything is looking hunky-dory.
34 | 35 | """ 36 | from io import open 37 | from setuptools import find_packages, setup 38 | 39 | setup( 40 | name="transformers", 41 | version="2.1.1", 42 | author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors", 43 | author_email="thomas@huggingface.co", 44 | description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch", 45 | long_description=open("README.md", "r", encoding='utf-8').read(), 46 | long_description_content_type="text/markdown", 47 | keywords='NLP deep learning transformer pytorch tensorflow BERT GPT GPT-2 google openai CMU', 48 | license='Apache', 49 | url="https://github.com/huggingface/transformers", 50 | packages=find_packages(exclude=["*.tests", "*.tests.*", 51 | "tests.*", "tests"]), 52 | install_requires=['numpy', 53 | 'boto3', 54 | 'requests', 55 | 'tqdm', 56 | 'regex', 57 | 'sentencepiece', 58 | 'sacremoses'], 59 | entry_points={ 60 | 'console_scripts': [ 61 | "transformers=transformers.__main__:main", 62 | ] 63 | }, 64 | # python_requires='>=3.5.0', 65 | tests_require=['pytest'], 66 | classifiers=[ 67 | 'Intended Audience :: Science/Research', 68 | 'License :: OSI Approved :: Apache Software License', 69 | 'Programming Language :: Python :: 3', 70 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 71 | ], 72 | ) 73 | -------------------------------------------------------------------------------- /transformers/transformers/tests/tokenization_gpt2_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | import json 20 | from io import open 21 | 22 | from transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES 23 | 24 | from .tokenization_tests_commons import CommonTestCases 25 | 26 | class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester): 27 | 28 | tokenizer_class = GPT2Tokenizer 29 | 30 | def setUp(self): 31 | super(GPT2TokenizationTest, self).setUp() 32 | 33 | # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt
34 |         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
35 |                  "\u0120", "\u0120l", "\u0120n",
36 |                  "\u0120lo", "\u0120low", "er",
37 |                  "\u0120lowest", "\u0120newer", "\u0120wider", "<unk>"]
38 |         vocab_tokens = dict(zip(vocab, range(len(vocab))))
39 |         merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
40 |         self.special_tokens_map = {"unk_token": "<unk>"}
41 | 
42 |         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
43 |         self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
44 |         with open(self.vocab_file, "w", encoding="utf-8") as fp:
45 |             fp.write(json.dumps(vocab_tokens) + "\n")
46 |         with open(self.merges_file, "w", encoding="utf-8") as fp:
47 |             fp.write("\n".join(merges))
48 | 
49 |     def get_tokenizer(self, **kwargs):
50 |         kwargs.update(self.special_tokens_map)
51 |         return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
52 | 
53 |     def get_input_output_texts(self):
54 |         input_text = u"lower newer"
55 |         output_text = u"lower newer"
56 |         return input_text, output_text
57 | 
58 |     def test_full_tokenizer(self):
59 |         tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
60 |         text = "lower newer"
61 |         bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
62 |         tokens = tokenizer.tokenize(text, add_prefix_space=True)
63 |         self.assertListEqual(tokens, bpe_tokens)
64 | 
65 |         input_tokens = tokens + [tokenizer.unk_token]
66 |         input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
67 |         self.assertListEqual(
68 |             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
69 | 
70 | 
71 | if __name__ == '__main__':
72 |     unittest.main()
73 | 
--------------------------------------------------------------------------------
/transformers/examples/run_tf_glue.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tensorflow as tf
3 | import tensorflow_datasets
4 | from transformers import BertTokenizer, TFBertForSequenceClassification, glue_convert_examples_to_features, BertForSequenceClassification
5 | 
6 | # script parameters
7 | BATCH_SIZE = 32
8 | EVAL_BATCH_SIZE = BATCH_SIZE * 2
9 | USE_XLA = False
10 | USE_AMP = False
11 | 
12 | tf.config.optimizer.set_jit(USE_XLA)
13 | tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})
14 | 
15 | # Load tokenizer and model from pretrained model/vocabulary
16 | tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
17 | model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
18 | 
19 | # Load dataset via TensorFlow Datasets
20 | data, info = tensorflow_datasets.load('glue/mrpc', with_info=True)
21 | train_examples = info.splits['train'].num_examples
22 | valid_examples = info.splits['validation'].num_examples
23 | 
24 | # Prepare dataset for GLUE as a tf.data.Dataset instance
25 | train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, 'mrpc')
26 | valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, 'mrpc')
27 | train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
28 | valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)
29 | 
30 | # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
31 | opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
32 | if USE_AMP:
33 |     # loss scaling is currently required when using mixed precision
34 |     opt = 
tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic') 35 | loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) 36 | metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') 37 | model.compile(optimizer=opt, loss=loss, metrics=[metric]) 38 | 39 | # Train and evaluate using tf.keras.Model.fit() 40 | train_steps = train_examples//BATCH_SIZE 41 | valid_steps = valid_examples//EVAL_BATCH_SIZE 42 | 43 | history = model.fit(train_dataset, epochs=2, steps_per_epoch=train_steps, 44 | validation_data=valid_dataset, validation_steps=valid_steps) 45 | 46 | # Save TF2 model 47 | os.makedirs('./save/', exist_ok=True) 48 | model.save_pretrained('./save/') 49 | 50 | # Load the TensorFlow model in PyTorch for inspection 51 | pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True) 52 | 53 | # Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task 54 | sentence_0 = 'This research was consistent with his findings.' 55 | sentence_1 = 'His findings were compatible with this research.' 56 | sentence_2 = 'His findings were not compatible with this research.' 57 | inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt') 58 | inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt') 59 | 60 | pred_1 = pytorch_model(**inputs_1)[0].argmax().item() 61 | pred_2 = pytorch_model(**inputs_2)[0].argmax().item() 62 | print('sentence_1 is', 'a paraphrase' if pred_1 else 'not a paraphrase', 'of sentence_0') 63 | print('sentence_2 is', 'a paraphrase' if pred_2 else 'not a paraphrase', 'of sentence_0') 64 | -------------------------------------------------------------------------------- /transformers/transformers/tests/tokenization_transfo_xl_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
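# --- Editorial note --------------------------------------------------------
# A behavior sketch mirroring the assertions in the test below (whitespace
# splitting plus optional lower-casing):
#
#     tokenizer = TransfoXLTokenizer(lower_case=True)
#     tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? ")
#     # -> ["hello", "!", "how", "are", "you", "?"]
# ----------------------------------------------------------------------------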
15 | from __future__ import absolute_import, division, print_function, unicode_literals
16 | 
17 | import os
18 | import unittest
19 | import pytest
20 | from io import open
21 | 
22 | from transformers import is_torch_available
23 | 
24 | if is_torch_available():
25 |     import torch
26 |     from transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES
27 | else:
28 |     pytestmark = pytest.mark.skip("Require Torch")  # TODO: untangle Transfo-XL tokenizer from torch.load and torch.save
29 | 
30 | from .tokenization_tests_commons import CommonTestCases
31 | 
32 | class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):
33 | 
34 |     tokenizer_class = TransfoXLTokenizer if is_torch_available() else None
35 | 
36 |     def setUp(self):
37 |         super(TransfoXLTokenizationTest, self).setUp()
38 | 
39 |         vocab_tokens = [
40 |             "<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un",
41 |             "running", ",", "low", "l",
42 |         ]
43 |         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
44 |         with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
45 |             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
46 | 
47 |     def get_tokenizer(self, **kwargs):
48 |         kwargs['lower_case'] = True
49 |         return TransfoXLTokenizer.from_pretrained(self.tmpdirname, **kwargs)
50 | 
51 |     def get_input_output_texts(self):
52 |         input_text = u"<unk> UNwanted , running"
53 |         output_text = u"<unk> unwanted, running"
54 |         return input_text, output_text
55 | 
56 |     def test_full_tokenizer(self):
57 |         tokenizer = TransfoXLTokenizer(vocab_file=self.vocab_file, lower_case=True)
58 | 
59 |         tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
60 |         self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
61 | 
62 |         self.assertListEqual(
63 |             tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
64 | 
65 |     def test_full_tokenizer_lower(self):
66 |         tokenizer = TransfoXLTokenizer(lower_case=True)
67 | 
68 |         self.assertListEqual(
69 |             tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "),
70 |             ["hello", "!", "how", "are", "you", "?"])
71 | 
72 |     def test_full_tokenizer_no_lower(self):
73 |         tokenizer = TransfoXLTokenizer(lower_case=False)
74 | 
75 |         self.assertListEqual(
76 |             tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "),
77 |             ["HeLLo", "!", "how", "Are", "yoU", "?"])
78 | 
79 | 
80 | if __name__ == '__main__':
81 |     unittest.main()
82 | 
--------------------------------------------------------------------------------
/transformers/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
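# --- Editorial note --------------------------------------------------------
# Example invocation (paths are placeholders; --gpt2_config_file is optional
# and defaults to the stock GPT2Config):
#
#     python convert_gpt2_original_tf_checkpoint_to_pytorch.py \
#         --gpt2_checkpoint_path /path/to/tf_checkpoint \
#         --pytorch_dump_folder_path /path/to/output_dir \
#         --gpt2_config_file /path/to/gpt2_config.json
# ----------------------------------------------------------------------------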
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from transformers import (CONFIG_NAME, WEIGHTS_NAME, 25 | GPT2Config, 26 | GPT2Model, 27 | load_tf_weights_in_gpt2) 28 | 29 | import logging 30 | logging.basicConfig(level=logging.INFO) 31 | 32 | 33 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 34 | # Construct model 35 | if gpt2_config_file == "": 36 | config = GPT2Config() 37 | else: 38 | config = GPT2Config.from_json_file(gpt2_config_file) 39 | model = GPT2Model(config) 40 | 41 | # Load weights from numpy 42 | load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) 43 | 44 | # Save pytorch-model 45 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 46 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 47 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 48 | torch.save(model.state_dict(), pytorch_weights_dump_path) 49 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 50 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 51 | f.write(config.to_json_string()) 52 | 53 | 54 | if __name__ == "__main__": 55 | parser = argparse.ArgumentParser() 56 | ## Required parameters 57 | parser.add_argument("--gpt2_checkpoint_path", 58 | default = None, 59 | type = str, 60 | required = True, 61 | help = "Path to the TensorFlow checkpoint path.") 62 | parser.add_argument("--pytorch_dump_folder_path", 63 | default = None, 64 | type = str, 65 | required = True, 66 | help = "Path to the output PyTorch model.") 67 | parser.add_argument("--gpt2_config_file", 68 | default = "", 69 | type = str, 70 | help = "An optional config json file corresponding to the pre-trained OpenAI model. 
\n" 71 | "This specifies the model architecture.") 72 | args = parser.parse_args() 73 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, 74 | args.gpt2_config_file, 75 | args.pytorch_dump_folder_path) 76 | -------------------------------------------------------------------------------- /transformers/examples/create_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import jsonlines 4 | from tqdm import tqdm 5 | from argparse import ArgumentParser 6 | from pathlib import Path 7 | import pprint 8 | import random 9 | 10 | 11 | 12 | def main(): 13 | parser = ArgumentParser() 14 | parser.add_argument('--data_path', type=Path, required=True) 15 | parser.add_argument('--generated_data_path', type=Path, required=True) 16 | parser.add_argument('--mix', action="store_true") 17 | parser.add_argument('--likelihood', action="store_true") 18 | parser.add_argument('--no_ref', action="store_true") 19 | parser.add_argument('--model', type=str, choices=["bert", "roberta"]) 20 | 21 | rules_stats = {} 22 | 23 | 24 | 25 | args = parser.parse_args() 26 | assert args.data_path.exists(), "data does not exist" 27 | 28 | MASK_TOKEN = "[MASK]" if args.model == "bert" else "" 29 | 30 | uuid = 0 31 | with open(args.data_path) as data_file, jsonlines.open(args.generated_data_path, 'w') as generated_file: 32 | for line in data_file.readlines(): 33 | columns = line.strip().split('\t') 34 | if (len(columns) == 4) and (columns[1] != "N/A"): 35 | random_word = columns[0].split()[random.randint(0, len(columns[0].split())-1)] 36 | random_prob = random.random() if args.mix else 0 37 | if random_prob < 0.5: 38 | doc = { 39 | "uuid": uuid, 40 | "matched_rule": columns[1], 41 | "masked_sentences": [((columns[0] + " ") if not args.no_ref else "") + ( (columns[2] if not args.likelihood else columns[0]).replace( random_word if (args.likelihood and columns[3] not in columns[0]) else columns[3] , MASK_TOKEN))], 42 | "obj_label": random_word if (args.likelihood and columns[3] not in columns[0]) else columns[3] 43 | } 44 | else: 45 | if args.likelihood: 46 | doc = { 47 | "uuid": uuid, 48 | "matched_rule": columns[1], 49 | "masked_sentences": [((columns[2] + " ") if not args.no_ref else "") + (columns[2]).replace(columns[3] , MASK_TOKEN)], 50 | "obj_label": columns[3] 51 | } 52 | else: 53 | doc = { 54 | "uuid": uuid, 55 | "matched_rule": columns[1], 56 | "masked_sentences": [((columns[2] + " ") if not args.no_ref else "") + ( (columns[0]).replace( random_word if (columns[3] not in columns[0]) else columns[3] , MASK_TOKEN))], 57 | "obj_label": random_word if (columns[3] not in columns[0]) else columns[3] 58 | } 59 | 60 | 61 | generated_file.write(doc) 62 | uuid += 1 63 | 64 | if columns[1] in rules_stats: 65 | rules_stats[columns[1]] += 1 66 | else: 67 | rules_stats[columns[1]] = 1 68 | print("Sample: ") 69 | print(doc) 70 | print("Done") 71 | rules_stats['total'] = sum(rules_stats.values()) 72 | pprint.pprint(rules_stats) 73 | 74 | 75 | # assert args.generated_data_path.is_dir(), "export path does not exist" 76 | 77 | if __name__ == '__main__': 78 | main() 79 | -------------------------------------------------------------------------------- /transformers/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from transformers import (CONFIG_NAME, WEIGHTS_NAME, 25 | OpenAIGPTConfig, 26 | OpenAIGPTModel, 27 | load_tf_weights_in_openai_gpt) 28 | 29 | import logging 30 | logging.basicConfig(level=logging.INFO) 31 | 32 | 33 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 34 | # Construct model 35 | if openai_config_file == "": 36 | config = OpenAIGPTConfig() 37 | else: 38 | config = OpenAIGPTConfig.from_json_file(openai_config_file) 39 | model = OpenAIGPTModel(config) 40 | 41 | # Load weights from numpy 42 | load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path) 43 | 44 | # Save pytorch-model 45 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 46 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 47 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 48 | torch.save(model.state_dict(), pytorch_weights_dump_path) 49 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 50 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 51 | f.write(config.to_json_string()) 52 | 53 | 54 | if __name__ == "__main__": 55 | parser = argparse.ArgumentParser() 56 | ## Required parameters 57 | parser.add_argument("--openai_checkpoint_folder_path", 58 | default = None, 59 | type = str, 60 | required = True, 61 | help = "Path to the TensorFlow checkpoint path.") 62 | parser.add_argument("--pytorch_dump_folder_path", 63 | default = None, 64 | type = str, 65 | required = True, 66 | help = "Path to the output PyTorch model.") 67 | parser.add_argument("--openai_config_file", 68 | default = "", 69 | type = str, 70 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 71 | "This specifies the model architecture.") 72 | args = parser.parse_args() 73 | convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path, 74 | args.openai_config_file, 75 | args.pytorch_dump_folder_path) 76 | -------------------------------------------------------------------------------- /transformers/transformers/data/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import csv 18 | import sys 19 | import logging 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | try: 24 | from scipy.stats import pearsonr, spearmanr 25 | from sklearn.metrics import matthews_corrcoef, f1_score 26 | _has_sklearn = True 27 | except (AttributeError, ImportError) as e: 28 | logger.warning("To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html") 29 | _has_sklearn = False 30 | 31 | def is_sklearn_available(): 32 | return _has_sklearn 33 | 34 | if _has_sklearn: 35 | 36 | def simple_accuracy(preds, labels): 37 | return (preds == labels).mean() 38 | 39 | 40 | def acc_and_f1(preds, labels): 41 | acc = simple_accuracy(preds, labels) 42 | f1 = f1_score(y_true=labels, y_pred=preds) 43 | return { 44 | "acc": acc, 45 | "f1": f1, 46 | "acc_and_f1": (acc + f1) / 2, 47 | } 48 | 49 | 50 | def pearson_and_spearman(preds, labels): 51 | pearson_corr = pearsonr(preds, labels)[0] 52 | spearman_corr = spearmanr(preds, labels)[0] 53 | return { 54 | "pearson": pearson_corr, 55 | "spearmanr": spearman_corr, 56 | "corr": (pearson_corr + spearman_corr) / 2, 57 | } 58 | 59 | 60 | def glue_compute_metrics(task_name, preds, labels): 61 | assert len(preds) == len(labels) 62 | if task_name == "cola": 63 | return {"mcc": matthews_corrcoef(labels, preds)} 64 | elif task_name == "sst-2": 65 | return {"acc": simple_accuracy(preds, labels)} 66 | elif task_name == "mrpc": 67 | return acc_and_f1(preds, labels) 68 | elif task_name == "sts-b": 69 | return pearson_and_spearman(preds, labels) 70 | elif task_name == "qqp": 71 | return acc_and_f1(preds, labels) 72 | elif task_name == "mnli": 73 | return {"acc": simple_accuracy(preds, labels)} 74 | elif task_name in ["mnli-contr", "mnli-neut", "mnli-entail", "mnli-neg", "mnli-neg-mm", "mnli_stress_neg_m", "mnli_stress_neg_mm"]: 75 | return {"acc": simple_accuracy(preds, labels)} 76 | elif task_name == "mnli-mm": 77 | return {"acc": simple_accuracy(preds, labels)} 78 | elif task_name == "qnli": 79 | return {"acc": simple_accuracy(preds, labels)} 80 | elif task_name == "rte": 81 | return {"acc": simple_accuracy(preds, labels)} 82 | elif task_name == "wnli": 83 | return {"acc": simple_accuracy(preds, labels)} 84 | elif task_name == "snli": 85 | return {"acc": simple_accuracy(preds, labels)} 86 | else: 87 | raise KeyError(task_name) 88 | -------------------------------------------------------------------------------- /transformers/transformers/tests/tokenization_xlm_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | import json 20 | 21 | from transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES 22 | 23 | from .tokenization_tests_commons import CommonTestCases 24 | 25 | class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester): 26 | 27 | tokenizer_class = XLMTokenizer 28 | 29 | def setUp(self): 30 | super(XLMTokenizationTest, self).setUp() 31 | 32 | # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt 33 | vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", 34 | "w</w>", "r</w>", "t</w>", 35 | "lo", "low", "er</w>", 36 | "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>"] 37 | vocab_tokens = dict(zip(vocab, range(len(vocab)))) 38 | merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""] 39 | 40 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) 41 | self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) 42 | with open(self.vocab_file, "w") as fp: 43 | fp.write(json.dumps(vocab_tokens)) 44 | with open(self.merges_file, "w") as fp: 45 | fp.write("\n".join(merges)) 46 | 47 | def get_tokenizer(self, **kwargs): 48 | return XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs) 49 | 50 | def get_input_output_texts(self): 51 | input_text = u"lower newer" 52 | output_text = u"lower newer" 53 | return input_text, output_text 54 | 55 | def test_full_tokenizer(self): 56 | """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """ 57 | tokenizer = XLMTokenizer(self.vocab_file, self.merges_file) 58 | 59 | text = "lower" 60 | bpe_tokens = ["low", "er</w>"] 61 | tokens = tokenizer.tokenize(text) 62 | self.assertListEqual(tokens, bpe_tokens) 63 | 64 | input_tokens = tokens + ["<unk>"] 65 | input_bpe_tokens = [14, 15, 20] 66 | self.assertListEqual( 67 | tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) 68 | 69 | def test_sequence_builders(self): 70 | tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048") 71 | 72 | text = tokenizer.encode("sequence builders") 73 | text_2 = tokenizer.encode("multi-sequence build") 74 | 75 | encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) 76 | encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) 77 | 78 | assert encoded_sentence == [1] + text + [1] 79 | assert encoded_pair == [1] + text + [1] + text_2 + [1] 80 | 81 | if __name__ == '__main__': 82 | unittest.main() 83 | -------------------------------------------------------------------------------- /transformers/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert XLM checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import json 21 | from io import open 22 | 23 | import torch 24 | import numpy 25 | 26 | from transformers import CONFIG_NAME, WEIGHTS_NAME 27 | from transformers.tokenization_xlm import VOCAB_FILES_NAMES 28 | 29 | import logging 30 | logging.basicConfig(level=logging.INFO) 31 | 32 | def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path): 33 | # Load checkpoint 34 | chkpt = torch.load(xlm_checkpoint_path, map_location='cpu') 35 | 36 | state_dict = chkpt['model'] 37 | 38 | # We have the base model one level deeper than the original XLM repository 39 | two_levels_state_dict = {} 40 | for k, v in state_dict.items(): 41 | if 'pred_layer' in k: 42 | two_levels_state_dict[k] = v 43 | else: 44 | two_levels_state_dict['transformer.' + k] = v 45 | 46 | config = chkpt['params'] 47 | config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))) 48 | 49 | vocab = chkpt['dico_word2id'] 50 | vocab = dict((s + '</w>' if s.find('@@') == -1 and i > 13 else s.replace('@@', ''), i) for s, i in vocab.items()) 51 | 52 | # Save pytorch-model 53 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 54 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 55 | pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['vocab_file'] 56 | 57 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 58 | torch.save(two_levels_state_dict, pytorch_weights_dump_path) 59 | 60 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 61 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 62 | f.write(json.dumps(config, indent=2) + "\n") 63 | 64 | print("Save vocab file to {}".format(pytorch_vocab_dump_path)) 65 | with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f: 66 | f.write(json.dumps(vocab, indent=2) + "\n") 67 | 68 | 69 | if __name__ == "__main__": 70 | parser = argparse.ArgumentParser() 71 | ## Required parameters 72 | parser.add_argument("--xlm_checkpoint_path", 73 | default = None, 74 | type = str, 75 | required = True, 76 | help = "Path to the official PyTorch dump.") 77 | parser.add_argument("--pytorch_dump_folder_path", 78 | default = None, 79 | type = str, 80 | required = True, 81 | help = "Path to the output PyTorch model.") 82 | args = parser.parse_args() 83 | convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path) 84 | -------------------------------------------------------------------------------- /transformers/transformers/configuration_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ DistilBERT model configuration """ 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import sys 20 | import json 21 | import logging 22 | from io import open 23 | 24 | from .configuration_utils import PretrainedConfig 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 29 | 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", 30 | 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json" 31 | } 32 | 33 | 34 | class DistilBertConfig(PretrainedConfig): 35 | pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 36 | 37 | def __init__(self, 38 | vocab_size_or_config_json_file=30522, 39 | max_position_embeddings=512, 40 | sinusoidal_pos_embds=False, 41 | n_layers=6, 42 | n_heads=12, 43 | dim=768, 44 | hidden_dim=4*768, 45 | dropout=0.1, 46 | attention_dropout=0.1, 47 | activation='gelu', 48 | initializer_range=0.02, 49 | tie_weights_=True, 50 | qa_dropout=0.1, 51 | seq_classif_dropout=0.2, 52 | **kwargs): 53 | super(DistilBertConfig, self).__init__(**kwargs) 54 | 55 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 56 | and isinstance(vocab_size_or_config_json_file, unicode)): 57 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 58 | json_config = json.loads(reader.read()) 59 | for key, value in json_config.items(): 60 | self.__dict__[key] = value 61 | elif isinstance(vocab_size_or_config_json_file, int): 62 | self.vocab_size = vocab_size_or_config_json_file 63 | self.max_position_embeddings = max_position_embeddings 64 | self.sinusoidal_pos_embds = sinusoidal_pos_embds 65 | self.n_layers = n_layers 66 | self.n_heads = n_heads 67 | self.dim = dim 68 | self.hidden_dim = hidden_dim 69 | self.dropout = dropout 70 | self.attention_dropout = attention_dropout 71 | self.activation = activation 72 | self.initializer_range = initializer_range 73 | self.tie_weights_ = tie_weights_ 74 | self.qa_dropout = qa_dropout 75 | self.seq_classif_dropout = seq_classif_dropout 76 | else: 77 | raise ValueError("First argument must be either a vocabulary size (int)" 78 | " or the path to a pretrained model config file (str)") 79 | @property 80 | def hidden_size(self): 81 | return self.dim 82 | 83 | @property 84 | def num_attention_heads(self): 85 | return self.n_heads 86 | 87 | @property 88 | def num_hidden_layers(self): 89 | return self.n_layers 90 | -------------------------------------------------------------------------------- /transformers/.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | build_py3_torch_and_tf: 4 | working_directory: ~/transformers 5 | docker: 6 | - image: circleci/python:3.5 7 | resource_class: xlarge 8 | parallelism: 1 9 | steps: 10 | - checkout 11 | - run: sudo pip install torch 12 | - run: sudo pip install tensorflow 13 | - run: 
sudo pip install --progress-bar off . 14 | - run: sudo pip install pytest codecov pytest-cov 15 | - run: sudo pip install tensorboardX scikit-learn 16 | - run: python -m pytest -sv ./transformers/tests/ --cov 17 | - run: codecov 18 | build_py3_torch: 19 | working_directory: ~/transformers 20 | docker: 21 | - image: circleci/python:3.5 22 | resource_class: xlarge 23 | parallelism: 1 24 | steps: 25 | - checkout 26 | - run: sudo pip install torch 27 | - run: sudo pip install --progress-bar off . 28 | - run: sudo pip install pytest codecov pytest-cov 29 | - run: sudo pip install tensorboardX scikit-learn 30 | - run: python -m pytest -sv ./transformers/tests/ --cov 31 | - run: python -m pytest -sv ./examples/ 32 | - run: codecov 33 | build_py3_tf: 34 | working_directory: ~/transformers 35 | docker: 36 | - image: circleci/python:3.5 37 | resource_class: xlarge 38 | parallelism: 1 39 | steps: 40 | - checkout 41 | - run: sudo pip install tensorflow 42 | - run: sudo pip install --progress-bar off . 43 | - run: sudo pip install pytest codecov pytest-cov 44 | - run: sudo pip install tensorboardX scikit-learn 45 | - run: python -m pytest -sv ./transformers/tests/ --cov 46 | - run: codecov 47 | build_py2_torch: 48 | working_directory: ~/transformers 49 | resource_class: large 50 | parallelism: 1 51 | docker: 52 | - image: circleci/python:2.7 53 | steps: 54 | - checkout 55 | - run: sudo pip install torch 56 | - run: sudo pip install --progress-bar off . 57 | - run: sudo pip install pytest codecov pytest-cov 58 | - run: python -m pytest -sv ./transformers/tests/ --cov 59 | - run: codecov 60 | build_py2_tf: 61 | working_directory: ~/transformers 62 | resource_class: large 63 | parallelism: 1 64 | docker: 65 | - image: circleci/python:2.7 66 | steps: 67 | - checkout 68 | - run: sudo pip install tensorflow 69 | - run: sudo pip install --progress-bar off . 70 | - run: sudo pip install pytest codecov pytest-cov 71 | - run: python -m pytest -sv ./transformers/tests/ --cov 72 | - run: codecov 73 | deploy_doc: 74 | working_directory: ~/transformers 75 | docker: 76 | - image: circleci/python:3.5 77 | steps: 78 | - add_ssh_keys: 79 | fingerprints: 80 | - "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71" 81 | - checkout 82 | - run: sudo pip install --progress-bar off -r docs/requirements.txt 83 | - run: sudo pip install --progress-bar off -r requirements.txt 84 | - run: cd docs && make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir 85 | workflow_filters: &workflow_filters 86 | filters: 87 | branches: 88 | only: 89 | - master 90 | workflows: 91 | version: 2 92 | build_and_test: 93 | jobs: 94 | - build_py3_torch_and_tf 95 | - build_py3_torch 96 | - build_py3_tf 97 | - build_py2_torch 98 | - build_py2_tf 99 | - deploy_doc: *workflow_filters -------------------------------------------------------------------------------- /transformers/examples/distillation/scripts/binarized_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ 16 | Preprocessing script before distillation. 17 | """ 18 | import argparse 19 | import pickle 20 | import random 21 | import time 22 | import numpy as np 23 | from transformers import BertTokenizer, RobertaTokenizer, GPT2Tokenizer 24 | import logging 25 | 26 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 27 | datefmt = '%m/%d/%Y %H:%M:%S', 28 | level = logging.INFO) 29 | logger = logging.getLogger(__name__) 30 | 31 | def main(): 32 | parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).") 33 | parser.add_argument('--file_path', type=str, default='data/dump.txt', 34 | help='The path to the data.') 35 | parser.add_argument('--tokenizer_type', type=str, default='bert', choices=['bert', 'roberta', 'gpt2']) 36 | parser.add_argument('--tokenizer_name', type=str, default='bert-base-uncased', 37 | help="The tokenizer to use.") 38 | parser.add_argument('--dump_file', type=str, default='data/dump', 39 | help='The dump file prefix.') 40 | args = parser.parse_args() 41 | 42 | 43 | logger.info(f'Loading Tokenizer ({args.tokenizer_name})') 44 | if args.tokenizer_type == 'bert': 45 | tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name) 46 | bos = tokenizer.special_tokens_map['cls_token'] # `[CLS]` 47 | sep = tokenizer.special_tokens_map['sep_token'] # `[SEP]` 48 | elif args.tokenizer_type == 'roberta': 49 | tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name) 50 | bos = tokenizer.special_tokens_map['cls_token'] # `<s>` 51 | sep = tokenizer.special_tokens_map['sep_token'] # `</s>` 52 | elif args.tokenizer_type == 'gpt2': 53 | tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name) 54 | bos = tokenizer.special_tokens_map['bos_token'] # `<|endoftext|>` 55 | sep = tokenizer.special_tokens_map['eos_token'] # `<|endoftext|>` 56 | 57 | logger.info(f'Loading text from {args.file_path}') 58 | with open(args.file_path, 'r', encoding='utf8') as fp: 59 | data = fp.readlines() 60 | 61 | 62 | logger.info('Start encoding') 63 | logger.info(f'{len(data)} examples to process.') 64 | 65 | rslt = [] 66 | iter = 0 67 | interval = 10000 68 | start = time.time() 69 | for text in data: 70 | text = f'{bos} {text.strip()} {sep}' 71 | token_ids = tokenizer.encode(text) 72 | rslt.append(token_ids) 73 | 74 | iter += 1 75 | if iter % interval == 0: 76 | end = time.time() 77 | logger.info(f'{iter} examples processed. 
- {(end-start)/interval:.2f}s/expl') 78 | start = time.time() 79 | logger.info('Finished binarization') 80 | logger.info(f'{len(data)} examples processed.') 81 | 82 | 83 | dp_file = f'{args.dump_file}.{args.tokenizer_name}.pickle' 84 | rslt_ = [np.uint16(d) for d in rslt] 85 | random.shuffle(rslt_) 86 | logger.info(f'Dump to {dp_file}') 87 | with open(dp_file, 'wb') as handle: 88 | pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL) 89 | 90 | 91 | if __name__ == "__main__": 92 | main() 93 | -------------------------------------------------------------------------------- /transformers/transformers/log_utils.py: -------------------------------------------------------------------------------- 1 | 2 | from datetime import datetime 3 | from collections import defaultdict 4 | import threading 5 | import time 6 | import logging 7 | import os 8 | 9 | from tensorboardX import SummaryWriter 10 | from pandas import DataFrame 11 | from collections import defaultdict 12 | 13 | 14 | class AverageMeterSet: 15 | def __init__(self): 16 | self.meters = {} 17 | 18 | def __getitem__(self, key): 19 | return self.meters[key] 20 | 21 | def update_dict(self, name_val_dict, n=1): 22 | for name, val in name_val_dict.items(): 23 | self.update(name, val, n) 24 | 25 | def update(self, name, value, n=1): 26 | if not name in self.meters: 27 | self.meters[name] = AverageMeter() 28 | self.meters[name].update(value, n) 29 | 30 | def reset(self): 31 | for meter in self.meters.values(): 32 | meter.reset() 33 | 34 | def values(self, postfix=''): 35 | return {name + postfix: meter.val for name, meter in self.meters.items()} 36 | 37 | def averages(self, postfix='/avg'): 38 | return {name + postfix: meter.avg for name, meter in self.meters.items()} 39 | 40 | def sums(self, postfix='/sum'): 41 | return {name + postfix: meter.sum for name, meter in self.meters.items()} 42 | 43 | def counts(self, postfix='/count'): 44 | return {name + postfix: meter.count for name, meter in self.meters.items()} 45 | 46 | 47 | class AverageMeter: 48 | """Computes and stores the average and current value""" 49 | def __init__(self): 50 | self.reset() 51 | 52 | def reset(self): 53 | self.val = 0 54 | self.avg = 0 55 | self.sum = 0 56 | self.count = 0 57 | 58 | def update(self, val, n=1): 59 | self.val = val 60 | self.sum += val * n 61 | self.count += n 62 | self.avg = self.sum / self.count 63 | 64 | def __format__(self, format): 65 | return "{self.val:{format}} ({self.avg:{format}})".format(self=self, format=format) 66 | 67 | 68 | class TrainLog: 69 | """Saves training logs in Pandas msgpacks""" 70 | INCREMENTAL_UPDATE_TIME = 300 71 | 72 | def __init__(self, directory, name, init_tb=False): 73 | self.name = name 74 | self.log_file_path = "{}/{}.msgpack".format(directory, name) 75 | self._log = defaultdict(dict) 76 | self._log_lock = threading.RLock() 77 | self._last_update_time = time.time() - self.INCREMENTAL_UPDATE_TIME 78 | self._summary_writer = None 79 | self._meter_set = AverageMeterSet() 80 | if init_tb: 81 | self._summary_writer = SummaryWriter(directory, comment=name) 82 | self._tb_fields = None 83 | 84 | def set_tb_fields(self, tbf): 85 | self._tb_fields = set(tbf) 86 | 87 | def is_tb_key(self, key): 88 | if not self._tb_fields: 89 | return True 90 | for name in self._tb_fields: 91 | if name in key: 92 | return True 93 | return False 94 | 95 | def get_summary_writer(self): 96 | return self._summary_writer 97 | 98 | def set_summary_writer(self, tbw): 99 | self._summary_writer = tbw 100 | 101 | def record_single(self, step, column, 
value): 102 | self._record(step, {column: value}) 103 | 104 | def record(self, step, col_val_dict, tbstep=None): 105 | if not tbstep: 106 | tbstep = step 107 | if self._summary_writer: 108 | for name, value in col_val_dict.items(): 109 | if self.is_tb_key(name): 110 | self._summary_writer.add_scalar("{}/{}".format(self.name, name), value, tbstep) 111 | self._record(step, col_val_dict) 112 | 113 | def save(self): 114 | df = self._as_dataframe() 115 | df.to_msgpack(self.log_file_path, compress='zlib') 116 | 117 | def _record(self, step, col_val_dict): 118 | with self._log_lock: 119 | self._log[step].update(col_val_dict) 120 | if time.time() - self._last_update_time >= self.INCREMENTAL_UPDATE_TIME: 121 | self._last_update_time = time.time() 122 | # self.save() 123 | if self._summary_writer: 124 | self._summary_writer.file_writer.flush() 125 | 126 | def _as_dataframe(self): 127 | with self._log_lock: 128 | return DataFrame.from_dict(self._log, orient='index') 129 | -------------------------------------------------------------------------------- /transformers/transformers/tests/fixtures/sample_text.txt: -------------------------------------------------------------------------------- 1 | This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত 2 | Text should be one-sentence-per-line, with empty lines between documents. 3 | This sample text is public domain and was randomly selected from Project Guttenberg. 4 | 5 | The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors. 6 | Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity. 7 | Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them. 8 | "Cass" Beard had risen early that morning, but not with a view to discovery. 9 | A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets. 10 | The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency. 11 | This was nearly opposite. 12 | Mr. Cassius crossed the highway, and stopped suddenly. 13 | Something glittered in the nearest red pool before him. 14 | Gold, surely! 15 | But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring. 16 | Looking at it more attentively, he saw that it bore the inscription, "May to Cass." 17 | Like most of his fellow gold-seekers, Cass was superstitious. 18 | 19 | The fountain of classic wisdom, Hypatia herself. 20 | As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge. 21 | From my youth I felt in me a soul above the matter-entangled herd. 22 | She revealed to me the glorious fact, that I am a spark of Divinity itself. 
23 | A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's. 24 | There is a philosophic pleasure in opening one's treasures to the modest young. 25 | Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street. 26 | Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide; 27 | but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind. 28 | Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now. 29 | His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert; 30 | while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts. 31 | At last they reached the quay at the opposite end of the street; 32 | and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers. 33 | He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him. 34 | -------------------------------------------------------------------------------- /transformers/transformers/tests/tokenization_roberta_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
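# Note on the fixture below: "\u0120" (rendered as "Ġ") is the marker that GPT-2/RoBERTa
# byte-level BPE vocabularies use for a leading space, which is why tokens such as
# "\u0120low" and "\u0120newer" appear in the toy vocabulary.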
15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import json 19 | import unittest 20 | from io import open 21 | 22 | from transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES 23 | from .tokenization_tests_commons import CommonTestCases 24 | 25 | 26 | class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): 27 | tokenizer_class = RobertaTokenizer 28 | 29 | def setUp(self): 30 | super(RobertaTokenizationTest, self).setUp() 31 | 32 | # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt 33 | vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", 34 | "\u0120", "\u0120l", "\u0120n", 35 | "\u0120lo", "\u0120low", "er", 36 | "\u0120lowest", "\u0120newer", "\u0120wider", "<unk>"] 37 | vocab_tokens = dict(zip(vocab, range(len(vocab)))) 38 | merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] 39 | self.special_tokens_map = {"unk_token": "<unk>"} 40 | 41 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) 42 | self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) 43 | with open(self.vocab_file, "w", encoding="utf-8") as fp: 44 | fp.write(json.dumps(vocab_tokens) + "\n") 45 | with open(self.merges_file, "w", encoding="utf-8") as fp: 46 | fp.write("\n".join(merges)) 47 | 48 | def get_tokenizer(self, **kwargs): 49 | kwargs.update(self.special_tokens_map) 50 | return RobertaTokenizer.from_pretrained(self.tmpdirname, **kwargs) 51 | 52 | def get_input_output_texts(self): 53 | input_text = u"lower newer" 54 | output_text = u"lower newer" 55 | return input_text, output_text 56 | 57 | def test_full_tokenizer(self): 58 | tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) 59 | text = "lower newer" 60 | bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"] 61 | tokens = tokenizer.tokenize(text, add_prefix_space=True) 62 | self.assertListEqual(tokens, bpe_tokens) 63 | 64 | input_tokens = tokens + [tokenizer.unk_token] 65 | input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] 66 | self.assertListEqual( 67 | tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) 68 | 69 | def roberta_dict_integration_testing(self): 70 | tokenizer = self.get_tokenizer() 71 | 72 | self.assertListEqual( 73 | tokenizer.encode('Hello world!'), 74 | [0, 31414, 232, 328, 2] 75 | ) 76 | self.assertListEqual( 77 | tokenizer.encode('Hello world! 
cécé herlolip 418'), 78 | [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2] 79 | ) 80 | 81 | def test_sequence_builders(self): 82 | tokenizer = RobertaTokenizer.from_pretrained("roberta-base") 83 | 84 | text = tokenizer.encode("sequence builders") 85 | text_2 = tokenizer.encode("multi-sequence build") 86 | 87 | encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True) 88 | encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True) 89 | 90 | encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) 91 | encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) 92 | 93 | assert encoded_sentence == encoded_text_from_decode 94 | assert encoded_pair == encoded_pair_from_decode 95 | 96 | 97 | if __name__ == '__main__': 98 | unittest.main() 99 | -------------------------------------------------------------------------------- /transformers/transformers/tests/modeling_tf_auto_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import shutil 21 | import pytest 22 | import logging 23 | 24 | from transformers import is_tf_available 25 | 26 | if is_tf_available(): 27 | from transformers import (AutoConfig, BertConfig, 28 | TFAutoModel, TFBertModel, 29 | TFAutoModelWithLMHead, TFBertForMaskedLM, 30 | TFAutoModelForSequenceClassification, TFBertForSequenceClassification, 31 | TFAutoModelForQuestionAnswering, TFBertForQuestionAnswering) 32 | from transformers.modeling_tf_bert import TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP 33 | 34 | from .modeling_common_test import (CommonTestCases, ids_tensor) 35 | from .configuration_common_test import ConfigTester 36 | else: 37 | pytestmark = pytest.mark.skip("Require TensorFlow") 38 | 39 | 40 | class TFAutoModelTest(unittest.TestCase): 41 | def test_model_from_pretrained(self): 42 | import h5py 43 | self.assertTrue(h5py.version.hdf5_version.startswith("1.10")) 44 | 45 | logging.basicConfig(level=logging.INFO) 46 | # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 47 | for model_name in ['bert-base-uncased']: 48 | config = AutoConfig.from_pretrained(model_name, force_download=True) 49 | self.assertIsNotNone(config) 50 | self.assertIsInstance(config, BertConfig) 51 | 52 | model = TFAutoModel.from_pretrained(model_name, force_download=True) 53 | self.assertIsNotNone(model) 54 | self.assertIsInstance(model, TFBertModel) 55 | 56 | def test_lmhead_model_from_pretrained(self): 57 | logging.basicConfig(level=logging.INFO) 58 | # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 59 | for model_name in ['bert-base-uncased']: 60 | config = 
AutoConfig.from_pretrained(model_name, force_download=True) 61 | self.assertIsNotNone(config) 62 | self.assertIsInstance(config, BertConfig) 63 | 64 | model = TFAutoModelWithLMHead.from_pretrained(model_name, force_download=True) 65 | self.assertIsNotNone(model) 66 | self.assertIsInstance(model, TFBertForMaskedLM) 67 | 68 | def test_sequence_classification_model_from_pretrained(self): 69 | logging.basicConfig(level=logging.INFO) 70 | # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 71 | for model_name in ['bert-base-uncased']: 72 | config = AutoConfig.from_pretrained(model_name, force_download=True) 73 | self.assertIsNotNone(config) 74 | self.assertIsInstance(config, BertConfig) 75 | 76 | model = TFAutoModelForSequenceClassification.from_pretrained(model_name, force_download=True) 77 | self.assertIsNotNone(model) 78 | self.assertIsInstance(model, TFBertForSequenceClassification) 79 | 80 | def test_question_answering_model_from_pretrained(self): 81 | logging.basicConfig(level=logging.INFO) 82 | # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 83 | for model_name in ['bert-base-uncased']: 84 | config = AutoConfig.from_pretrained(model_name, force_download=True) 85 | self.assertIsNotNone(config) 86 | self.assertIsInstance(config, BertConfig) 87 | 88 | model = TFAutoModelForQuestionAnswering.from_pretrained(model_name, force_download=True) 89 | self.assertIsNotNone(model) 90 | self.assertIsInstance(model, TFBertForQuestionAnswering) 91 | 92 | 93 | if __name__ == "__main__": 94 | unittest.main() 95 | -------------------------------------------------------------------------------- /transformers/transformers/tests/modeling_auto_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
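# A minimal sketch of the Auto* API exercised by the tests below (assumes the
# 'bert-base-uncased' weights are downloadable in the test environment):
#
#   from transformers import AutoConfig, AutoModel
#   config = AutoConfig.from_pretrained('bert-base-uncased')  # resolves to BertConfig
#   model = AutoModel.from_pretrained('bert-base-uncased')    # resolves to BertModel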
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import shutil 21 | import pytest 22 | import logging 23 | 24 | from transformers import is_torch_available 25 | 26 | if is_torch_available(): 27 | from transformers import (AutoConfig, BertConfig, 28 | AutoModel, BertModel, 29 | AutoModelWithLMHead, BertForMaskedLM, 30 | AutoModelForSequenceClassification, BertForSequenceClassification, 31 | AutoModelForQuestionAnswering, BertForQuestionAnswering) 32 | from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP 33 | 34 | from .modeling_common_test import (CommonTestCases, ids_tensor) 35 | from .configuration_common_test import ConfigTester 36 | else: 37 | pytestmark = pytest.mark.skip("Require Torch") 38 | 39 | 40 | class AutoModelTest(unittest.TestCase): 41 | def test_model_from_pretrained(self): 42 | logging.basicConfig(level=logging.INFO) 43 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 44 | config = AutoConfig.from_pretrained(model_name) 45 | self.assertIsNotNone(config) 46 | self.assertIsInstance(config, BertConfig) 47 | 48 | model = AutoModel.from_pretrained(model_name) 49 | model, loading_info = AutoModel.from_pretrained(model_name, output_loading_info=True) 50 | self.assertIsNotNone(model) 51 | self.assertIsInstance(model, BertModel) 52 | for value in loading_info.values(): 53 | self.assertEqual(len(value), 0) 54 | 55 | def test_lmhead_model_from_pretrained(self): 56 | logging.basicConfig(level=logging.INFO) 57 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 58 | config = AutoConfig.from_pretrained(model_name) 59 | self.assertIsNotNone(config) 60 | self.assertIsInstance(config, BertConfig) 61 | 62 | model = AutoModelWithLMHead.from_pretrained(model_name) 63 | model, loading_info = AutoModelWithLMHead.from_pretrained(model_name, output_loading_info=True) 64 | self.assertIsNotNone(model) 65 | self.assertIsInstance(model, BertForMaskedLM) 66 | 67 | def test_sequence_classification_model_from_pretrained(self): 68 | logging.basicConfig(level=logging.INFO) 69 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 70 | config = AutoConfig.from_pretrained(model_name) 71 | self.assertIsNotNone(config) 72 | self.assertIsInstance(config, BertConfig) 73 | 74 | model = AutoModelForSequenceClassification.from_pretrained(model_name) 75 | model, loading_info = AutoModelForSequenceClassification.from_pretrained(model_name, output_loading_info=True) 76 | self.assertIsNotNone(model) 77 | self.assertIsInstance(model, BertForSequenceClassification) 78 | 79 | def test_question_answering_model_from_pretrained(self): 80 | logging.basicConfig(level=logging.INFO) 81 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 82 | config = AutoConfig.from_pretrained(model_name) 83 | self.assertIsNotNone(config) 84 | self.assertIsInstance(config, BertConfig) 85 | 86 | model = AutoModelForQuestionAnswering.from_pretrained(model_name) 87 | model, loading_info = AutoModelForQuestionAnswering.from_pretrained(model_name, output_loading_info=True) 88 | self.assertIsNotNone(model) 89 | self.assertIsInstance(model, BertForQuestionAnswering) 90 | 91 | 92 | if __name__ == "__main__": 93 | unittest.main() 94 | -------------------------------------------------------------------------------- /transformers/examples/distillation/scripts/extract_distilbert.py: 
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ 16 | Preprocessing script before training DistilBERT. 17 | Specific to BERT -> DistilBERT. 18 | """ 19 | from transformers import BertForMaskedLM, RobertaForMaskedLM 20 | import torch 21 | import argparse 22 | 23 | if __name__ == '__main__': 24 | parser = argparse.ArgumentParser(description="Extract some layers of the full BertForMaskedLM or RobertaForMaskedLM for transfer-learned distillation") 25 | parser.add_argument("--model_type", default="bert", choices=["bert"]) 26 | parser.add_argument("--model_name", default='bert-base-uncased', type=str) 27 | parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_bert-base-uncased_0247911.pth', type=str) 28 | parser.add_argument("--vocab_transform", action='store_true') 29 | args = parser.parse_args() 30 | 31 | 32 | if args.model_type == 'bert': 33 | model = BertForMaskedLM.from_pretrained(args.model_name) 34 | prefix = 'bert' 35 | else: 36 | raise ValueError('args.model_type should be "bert".') 37 | 38 | state_dict = model.state_dict() 39 | compressed_sd = {} 40 | 41 | for w in ['word_embeddings', 'position_embeddings']: 42 | compressed_sd[f'distilbert.embeddings.{w}.weight'] = \ 43 | state_dict[f'{prefix}.embeddings.{w}.weight'] 44 | for w in ['weight', 'bias']: 45 | compressed_sd[f'distilbert.embeddings.LayerNorm.{w}'] = \ 46 | state_dict[f'{prefix}.embeddings.LayerNorm.{w}'] 47 | 48 | std_idx = 0 49 | for teacher_idx in [0, 2, 4, 7, 9, 11]: 50 | for w in ['weight', 'bias']: 51 | compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}'] = \ 52 | state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.query.{w}'] 53 | compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.k_lin.{w}'] = \ 54 | state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.key.{w}'] 55 | compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.v_lin.{w}'] = \ 56 | state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.value.{w}'] 57 | 58 | compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.out_lin.{w}'] = \ 59 | state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.output.dense.{w}'] 60 | compressed_sd[f'distilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}'] = \ 61 | state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}'] 62 | 63 | compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin1.{w}'] = \ 64 | state_dict[f'{prefix}.encoder.layer.{teacher_idx}.intermediate.dense.{w}'] 65 | compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin2.{w}'] = \ 66 | state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.dense.{w}'] 67 | compressed_sd[f'distilbert.transformer.layer.{std_idx}.output_layer_norm.{w}'] = \ 68 | 
state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}'] 69 | std_idx += 1 70 | 71 | compressed_sd[f'vocab_projector.weight'] = state_dict[f'cls.predictions.decoder.weight'] 72 | compressed_sd[f'vocab_projector.bias'] = state_dict[f'cls.predictions.bias'] 73 | if args.vocab_transform: 74 | for w in ['weight', 'bias']: 75 | compressed_sd[f'vocab_transform.{w}'] = state_dict[f'cls.predictions.transform.dense.{w}'] 76 | compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'cls.predictions.transform.LayerNorm.{w}'] 77 | 78 | print(f'N layers selected for distillation: {std_idx}') 79 | print(f'Number of params transferred for distillation: {len(compressed_sd.keys())}') 80 | 81 | print(f'Save transferred checkpoint to {args.dump_checkpoint}.') 82 | torch.save(compressed_sd, args.dump_checkpoint) 83 | -------------------------------------------------------------------------------- /transformers/examples/lm_training/dist_comms.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import torch 5 | import torch.distributed as dist 6 | from fp16_utils import maybe_half 7 | 8 | 9 | MAX_GROUP_SIZE = 32 10 | current_process_group = None 11 | 12 | 13 | def init_distributed_training(local_rank, port_idx=0): 14 | ports = ['8787', '8686', '8585', '8484'] 15 | os.environ['MASTER_ADDR'] = 'localhost' 16 | os.environ['MASTER_PORT'] = ports[port_idx] 17 | torch.cuda.set_device(local_rank) 18 | torch.distributed.init_process_group(backend='nccl', 19 | init_method='env://', 20 | rank=local_rank) 21 | create_groups() 22 | 23 | 24 | def create_groups(): 25 | global current_process_group 26 | # collect some useful information 27 | world_size = dist.get_world_size() 28 | my_rank = dist.get_rank() 29 | # assign gpus to groups 30 | group, groups = [], [] 31 | for i in range(world_size): 32 | group.append(i) 33 | if (len(group) == MAX_GROUP_SIZE) or (i == (world_size - 1)): 34 | groups.append(group) 35 | group = [] 36 | # tell pytorch about each group 37 | for i, group in enumerate(groups): 38 | process_group = dist.new_group(ranks=group) 39 | if my_rank in group: 40 | # record which process group includes current process 41 | current_process_group = process_group 42 | if my_rank == 0: 43 | print('Adding distributed group {}, including GPUs [{}...{}]' 44 | .format(i, group[0], group[-1])) 45 | 46 | 47 | def get_group(): 48 | return current_process_group 49 | 50 | 51 | def get_group_idx(): 52 | return dist.get_rank() // MAX_GROUP_SIZE 53 | 54 | 55 | def get_group_size(): 56 | return get_group().size() 57 | 58 | 59 | def get_group_rank(): 60 | return dist.get_rank(get_group()) 61 | 62 | 63 | def reduce_tensor(tensor): 64 | rt = tensor.clone() 65 | dist.all_reduce(rt, op=dist.ReduceOp.SUM) 66 | rt /= dist.get_world_size() 67 | return rt 68 | 69 | 70 | def reduce_scalar(scalar): 71 | if hasattr(scalar, 'device'): 72 | scalar = scalar.item() 73 | reduced_scalar = reduce_tensor(torch.tensor(scalar).cuda()).item() 74 | return reduced_scalar 75 | 76 | 77 | def all_gather_no_grad(tensor, *output_list): 78 | dist.all_gather(list(output_list), tensor, get_group()) 79 | return tuple(output_list) 80 | 81 | 82 | class AllGatherWithGrads(torch.autograd.Function): 83 | @staticmethod 84 | def forward(ctx, tensor, *output_list): 85 | dist.all_gather(list(output_list), tensor, get_group()) 86 | return tuple(output_list) 87 | 88 | @staticmethod 89 | def backward(ctx, *grad_outputs): 90 | # collect some info about current process and process groups 91 
91 | group = get_group() 92 | group_rank = get_group_rank() 93 | group_size = get_group_size() 94 | global_rank = dist.get_rank() 95 | # figure out which processes are in group with current process 96 | start_idx = get_group_idx() * MAX_GROUP_SIZE 97 | group_idx = np.arange(start_idx, start_idx + group_size) 98 | # gather gradient info from all processes in current process group 99 | handles = [] 100 | t_grad = grad_outputs[group_rank].clone() 101 | for i in range(group_size): 102 | if i == group_rank: 103 | # gradient info from self 104 | hdl = dist.reduce(t_grad.contiguous(), global_rank, 105 | group=group, async_op=True) 106 | else: 107 | # gradient info from other group members 108 | hdl = dist.reduce(grad_outputs[i].contiguous(), group_idx[i], 109 | group=group, async_op=True) 110 | handles.append(hdl) 111 | # wait for async ops to finish 112 | for h in handles: 113 | h.wait() 114 | return (t_grad,) + grad_outputs 115 | 116 | 117 | def all_gather_local_group(*tensors): 118 | all_gather_fn = AllGatherWithGrads.apply 119 | # all_gather_fn = all_gather_no_grad 120 | t_out = [] 121 | for t in tensors: 122 | out = [torch.empty_like(t) for i in range(get_group_size())] 123 | out = all_gather_fn(t, *out) 124 | out = torch.cat(out) 125 | out = maybe_half(out) 126 | t_out.append(out) 127 | return t_out 128 | -------------------------------------------------------------------------------- /transformers/docs/source/multilingual.rst: -------------------------------------------------------------------------------- 1 | Multi-lingual models 2 | ================================================ 3 | 4 | Most of the models available in this library are mono-lingual models (English, Chinese and German). A few 5 | multi-lingual models are available and have different mechanisms than mono-lingual models. 6 | This page details the usage of these models. 7 | 8 | The two models that currently support multiple languages are BERT and XLM. 9 | 10 | XLM 11 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 12 | 13 | XLM has a total of 10 different checkpoints, only one of which is mono-lingual. The 9 remaining model checkpoints can 14 | be split into two categories: the checkpoints that make use of language embeddings, and those that don't. 15 | 16 | XLM & Language Embeddings 17 | ------------------------------------------------ 18 | 19 | This section concerns the following checkpoints: 20 | 21 | - ``xlm-mlm-ende-1024`` (Masked language modeling, English-German) 22 | - ``xlm-mlm-enfr-1024`` (Masked language modeling, English-French) 23 | - ``xlm-mlm-enro-1024`` (Masked language modeling, English-Romanian) 24 | - ``xlm-mlm-xnli15-1024`` (Masked language modeling, XNLI languages) 25 | - ``xlm-mlm-tlm-xnli15-1024`` (Masked language modeling + Translation, XNLI languages) 26 | - ``xlm-clm-enfr-1024`` (Causal language modeling, English-French) 27 | - ``xlm-clm-ende-1024`` (Causal language modeling, English-German) 28 | 29 | These checkpoints require language embeddings that will specify the language used at inference time. These language 30 | embeddings are represented as a tensor that is of the same shape as the input ids passed to the model. The values in 31 | these tensors depend on the language used and are identifiable using the ``lang2id`` and ``id2lang`` attributes 32 | from the tokenizer. 33 | 34 | Here is an example using the ``xlm-clm-enfr-1024`` checkpoint (Causal language modeling, English-French): 35 | 36 |
37 | .. code-block:: 38 | 39 | import torch 40 | from transformers import XLMTokenizer, XLMWithLMHeadModel 41 | 42 | tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024") 43 | model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024") 44 | 45 | The different languages this model/tokenizer handles, as well as the ids of these languages, are visible using the 46 | ``lang2id`` attribute: 47 | 48 | .. code-block:: 49 | 50 | print(tokenizer.lang2id) # {'en': 0, 'fr': 1} 51 | 52 | 53 | These ids should be used when passing a language parameter during a model pass. Let's define our inputs: 54 | 55 | .. code-block:: 56 | 57 | input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1 58 | 59 | 60 | We should now define the language embedding by using the previously defined language id. We want to create a tensor 61 | filled with the appropriate language ids, of the same size as input_ids. For English, the id is 0: 62 | 63 | .. code-block:: 64 | 65 | language_id = tokenizer.lang2id['en'] # 0 66 | langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0]) 67 | 68 | # We reshape it to be of size (batch_size, sequence_length) 69 | langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1) 70 | 71 | 72 | You can then feed it all as input to your model: 73 | 74 | .. code-block:: 75 | 76 | outputs = model(input_ids, langs=langs) 77 | 78 | 79 | The example `run_generation.py `__ 80 | can generate text using the CLM checkpoints from XLM, using the language embeddings. 81 | 82 | XLM without Language Embeddings 83 | ------------------------------------------------ 84 | 85 | This section concerns the following checkpoints: 86 | 87 | - ``xlm-mlm-17-1280`` (Masked language modeling, 17 languages) 88 | - ``xlm-mlm-100-1280`` (Masked language modeling, 100 languages) 89 | 90 | These checkpoints do not require language embeddings at inference time. Unlike the previously-mentioned XLM 91 | checkpoints, these models are used to produce generic sentence representations. 92 | 93 | 94 | BERT 95 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 96 | 97 | BERT has two checkpoints that can be used for multi-lingual tasks: 98 | 99 | - ``bert-base-multilingual-uncased`` (Masked language modeling + Next sentence prediction, 102 languages) 100 | - ``bert-base-multilingual-cased`` (Masked language modeling + Next sentence prediction, 104 languages) 101 | 102 | These checkpoints do not require language embeddings at inference time. They should identify the language 103 | used in the context and infer accordingly. -------------------------------------------------------------------------------- /transformers/examples/distillation/scripts/extract.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ 16 | Preprocessing script before training the distilled model.
17 | Specific to RoBERTa -> DistilRoBERTa and GPT2 -> DistilGPT2. 18 | """ 19 | from transformers import RobertaForMaskedLM, GPT2LMHeadModel 20 | import torch 21 | import argparse 22 | 23 | if __name__ == '__main__': 24 | parser = argparse.ArgumentParser(description="Extraction of some layers of the full RobertaForMaskedLM or GPT2LMHeadModel for Transfer Learned Distillation") 25 | parser.add_argument("--model_type", default="roberta", choices=["roberta", "gpt2"]) 26 | parser.add_argument("--model_name", default='roberta-large', type=str) 27 | parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_roberta_048131723.pth', type=str) 28 | parser.add_argument("--vocab_transform", action='store_true') 29 | args = parser.parse_args() 30 | 31 | 32 | if args.model_type == 'roberta': 33 | model = RobertaForMaskedLM.from_pretrained(args.model_name) 34 | prefix = 'roberta' 35 | elif args.model_type == 'gpt2': 36 | model = GPT2LMHeadModel.from_pretrained(args.model_name) 37 | prefix = 'transformer' 38 | 39 | state_dict = model.state_dict() 40 | compressed_sd = {} 41 | 42 | ### Embeddings ### 43 | if args.model_type == 'gpt2': 44 | for param_name in ['wte.weight', 'wpe.weight']: 45 | compressed_sd[f'{prefix}.{param_name}'] = state_dict[f'{prefix}.{param_name}'] 46 | else: 47 | for w in ['word_embeddings', 'position_embeddings', 'token_type_embeddings']: 48 | param_name = f'{prefix}.embeddings.{w}.weight' 49 | compressed_sd[param_name] = state_dict[param_name] 50 | for w in ['weight', 'bias']: 51 | param_name = f'{prefix}.embeddings.LayerNorm.{w}' 52 | compressed_sd[param_name] = state_dict[param_name] 53 | 54 | ### Transformer Blocks ### 55 | std_idx = 0 56 | for teacher_idx in [0, 2, 4, 7, 9, 11]: 57 | if args.model_type == 'gpt2': 58 | for layer in ['ln_1', 'attn.c_attn', 'attn.c_proj', 'ln_2', 'mlp.c_fc', 'mlp.c_proj']: 59 | for w in ['weight', 'bias']: 60 | compressed_sd[f'{prefix}.h.{std_idx}.{layer}.{w}'] = \ 61 | state_dict[f'{prefix}.h.{teacher_idx}.{layer}.{w}'] 62 | compressed_sd[f'{prefix}.h.{std_idx}.attn.bias'] = state_dict[f'{prefix}.h.{teacher_idx}.attn.bias'] 63 | else: 64 | for layer in ['attention.self.query', 'attention.self.key', 'attention.self.value', 65 | 'attention.output.dense', 'attention.output.LayerNorm', 66 | 'intermediate.dense', 'output.dense', 'output.LayerNorm']: 67 | for w in ['weight', 'bias']: 68 | compressed_sd[f'{prefix}.encoder.layer.{std_idx}.{layer}.{w}'] = \ 69 | state_dict[f'{prefix}.encoder.layer.{teacher_idx}.{layer}.{w}'] 70 | std_idx += 1 71 | 72 | ### Language Modeling Head ### 73 | if args.model_type == 'roberta': 74 | for layer in ['lm_head.decoder.weight', 'lm_head.bias']: 75 | compressed_sd[layer] = state_dict[layer] 76 | if args.vocab_transform: 77 | for w in ['weight', 'bias']: 78 | compressed_sd[f'lm_head.dense.{w}'] = state_dict[f'lm_head.dense.{w}'] 79 | compressed_sd[f'lm_head.layer_norm.{w}'] = state_dict[f'lm_head.layer_norm.{w}'] 80 | elif args.model_type == 'gpt2': 81 | for w in ['weight', 'bias']: 82 | compressed_sd[f'{prefix}.ln_f.{w}'] = state_dict[f'{prefix}.ln_f.{w}'] 83 | compressed_sd['lm_head.weight'] = state_dict['lm_head.weight'] 84 | 85 | print(f'N layers selected for distillation: {std_idx}') 86 | print(f'Number of params transferred for distillation: {len(compressed_sd)}') 87 | 88 | print(f'Save transferred checkpoint to {args.dump_checkpoint}.') 89 | torch.save(compressed_sd, args.dump_checkpoint) 90 |
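A minimal sketch of sanity-checking the dump produced by this script. The 6-layer student config and the use of ``load_state_dict(strict=False)`` are illustrative assumptions on my part, not code from this repository; the path matches the default ``--dump_checkpoint`` above.

# Sketch only: load the extracted weights into a 6-layer RoBERTa student
# and check that the transferred keys line up. Assumes transformers>=2.0.
import torch
from transformers import RobertaConfig, RobertaForMaskedLM

student_config = RobertaConfig.from_pretrained('roberta-large')
student_config.num_hidden_layers = 6  # the six teacher layers selected above
student = RobertaForMaskedLM(student_config)

compressed_sd = torch.load('serialization_dir/tf_roberta_048131723.pth', map_location='cpu')
missing, unexpected = student.load_state_dict(compressed_sd, strict=False)
print(f'missing keys: {len(missing)}, unexpected keys: {len(unexpected)}')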
-------------------------------------------------------------------------------- /transformers/examples/test_examples.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 HuggingFace Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import sys 20 | import unittest 21 | import argparse 22 | import logging 23 | 24 | try: 25 | # python 3.4+ can use builtin unittest.mock instead of mock package 26 | from unittest.mock import patch 27 | except ImportError: 28 | from mock import patch 29 | 30 | import run_glue 31 | import run_squad 32 | import run_generation 33 | 34 | logging.basicConfig(level=logging.DEBUG) 35 | 36 | logger = logging.getLogger() 37 | 38 | def get_setup_file(): 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument('-f') 41 | args = parser.parse_args() 42 | return args.f 43 | 44 | class ExamplesTests(unittest.TestCase): 45 | 46 | def test_run_glue(self): 47 | stream_handler = logging.StreamHandler(sys.stdout) 48 | logger.addHandler(stream_handler) 49 | 50 | testargs = ["run_glue.py", 51 | "--data_dir=./examples/tests_samples/MRPC/", 52 | "--task_name=mrpc", 53 | "--do_train", 54 | "--do_eval", 55 | "--output_dir=./examples/tests_samples/temp_dir", 56 | "--per_gpu_train_batch_size=2", 57 | "--per_gpu_eval_batch_size=1", 58 | "--learning_rate=1e-4", 59 | "--max_steps=10", 60 | "--warmup_steps=2", 61 | "--overwrite_output_dir", 62 | "--seed=42"] 63 | model_type, model_name = ("--model_type=bert", 64 | "--model_name_or_path=bert-base-uncased") 65 | with patch.object(sys, 'argv', testargs + [model_type, model_name]): 66 | result = run_glue.main() 67 | for value in result.values(): 68 | self.assertGreaterEqual(value, 0.75) 69 | 70 | def test_run_squad(self): 71 | stream_handler = logging.StreamHandler(sys.stdout) 72 | logger.addHandler(stream_handler) 73 | 74 | testargs = ["run_squad.py", 75 | "--train_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json", 76 | "--predict_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json", 77 | "--model_name=bert-base-uncased", 78 | "--output_dir=./examples/tests_samples/temp_dir", 79 | "--max_steps=10", 80 | "--warmup_steps=2", 81 | "--do_train", 82 | "--do_eval", 83 | "--version_2_with_negative", 84 | "--learning_rate=2e-4", 85 | "--per_gpu_train_batch_size=2", 86 | "--per_gpu_eval_batch_size=1", 87 | "--overwrite_output_dir", 88 | "--seed=42"] 89 | model_type, model_name = ("--model_type=bert", 90 | "--model_name_or_path=bert-base-uncased") 91 | with patch.object(sys, 'argv', testargs + [model_type, model_name]): 92 | result = run_squad.main() 93 | self.assertGreaterEqual(result['f1'], 30) 94 | self.assertGreaterEqual(result['exact'], 30) 95 | 96 | def test_generation(self): 97 | stream_handler = logging.StreamHandler(sys.stdout) 98 | logger.addHandler(stream_handler) 99 |
= ["run_generation.py", 101 | "--prompt=Hello", 102 | "--length=10", 103 | "--seed=42"] 104 | model_type, model_name = ("--model_type=openai-gpt", 105 | "--model_name_or_path=openai-gpt") 106 | with patch.object(sys, 'argv', testargs + [model_type, model_name]): 107 | result = run_generation.main() 108 | self.assertGreaterEqual(len(result), 10) 109 | 110 | if __name__ == "__main__": 111 | unittest.main() 112 | -------------------------------------------------------------------------------- /transformers/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import argparse 23 | import torch 24 | 25 | from transformers import (CONFIG_NAME, WEIGHTS_NAME, 26 | XLNetConfig, 27 | XLNetLMHeadModel, XLNetForQuestionAnswering, 28 | XLNetForSequenceClassification, 29 | load_tf_weights_in_xlnet) 30 | 31 | GLUE_TASKS_NUM_LABELS = { 32 | "cola": 2, 33 | "mnli": 3, 34 | "mrpc": 2, 35 | "sst-2": 2, 36 | "sts-b": 1, 37 | "qqp": 2, 38 | "qnli": 2, 39 | "rte": 2, 40 | "wnli": 2, 41 | } 42 | 43 | import logging 44 | logging.basicConfig(level=logging.INFO) 45 | 46 | def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None): 47 | # Initialise PyTorch model 48 | config = XLNetConfig.from_json_file(bert_config_file) 49 | 50 | finetuning_task = finetuning_task.lower() if finetuning_task is not None else "" 51 | if finetuning_task in GLUE_TASKS_NUM_LABELS: 52 | print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config))) 53 | config.finetuning_task = finetuning_task 54 | config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task] 55 | model = XLNetForSequenceClassification(config) 56 | elif 'squad' in finetuning_task: 57 | config.finetuning_task = finetuning_task 58 | model = XLNetForQuestionAnswering(config) 59 | else: 60 | model = XLNetLMHeadModel(config) 61 | 62 | # Load weights from tf checkpoint 63 | load_tf_weights_in_xlnet(model, config, tf_checkpoint_path) 64 | 65 | # Save pytorch-model 66 | pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) 67 | pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) 68 | print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) 69 | torch.save(model.state_dict(), pytorch_weights_dump_path) 70 | print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) 71 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 72 | f.write(config.to_json_string()) 73 | 74 | 75 | if __name__ == "__main__": 76 | parser = 
76 | parser = argparse.ArgumentParser() 77 | ## Required parameters 78 | parser.add_argument("--tf_checkpoint_path", 79 | default = None, 80 | type = str, 81 | required = True, 82 | help = "Path to the TensorFlow checkpoint path.") 83 | parser.add_argument("--xlnet_config_file", 84 | default = None, 85 | type = str, 86 | required = True, 87 | help = "The config json file corresponding to the pre-trained XLNet model. \n" 88 | "This specifies the model architecture.") 89 | parser.add_argument("--pytorch_dump_folder_path", 90 | default = None, 91 | type = str, 92 | required = True, 93 | help = "Path to the folder to store the PyTorch model or dataset/vocab.") 94 | parser.add_argument("--finetuning_task", 95 | default = None, 96 | type = str, 97 | help = "Name of a task on which the XLNet TensorFlow model was fine-tuned") 98 | args = parser.parse_args() 99 | print(args) 100 | 101 | convert_xlnet_checkpoint_to_pytorch(args.tf_checkpoint_path, 102 | args.xlnet_config_file, 103 | args.pytorch_dump_folder_path, 104 | args.finetuning_task) 105 | -------------------------------------------------------------------------------- /transformers/examples/distillation/grouped_batch_sampler.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team and Facebook, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Adapted from PyTorch Vision (https://github.com/pytorch/vision/blob/master/references/detection/group_by_aspect_ratio.py) 16 | """ 17 | import bisect 18 | import copy 19 | from collections import defaultdict 20 | import numpy as np 21 | 22 | from torch.utils.data.sampler import BatchSampler, Sampler 23 | 24 | from utils import logger 25 | 26 | def _quantize(x, bins): 27 | bins = copy.deepcopy(bins) 28 | bins = sorted(bins) 29 | quantized = list(map(lambda y: bisect.bisect_right(bins, y), x)) 30 | return quantized 31 | 32 | def create_lengths_groups(lengths, k=0): 33 | bins = np.arange(start=3, stop=k, step=4).tolist() if k > 0 else [10] 34 | groups = _quantize(lengths, bins) 35 | # count number of elements per group 36 | counts = np.unique(groups, return_counts=True)[1] 37 | fbins = [0] + bins + [np.inf] 38 | logger.info("Using {} as bins for lengths quantization".format(fbins)) 39 | logger.info("Count of instances per bin: {}".format(counts)) 40 | return groups 41 | 42 | class GroupedBatchSampler(BatchSampler): 43 | """ 44 | Wraps another sampler to yield a mini-batch of indices. 45 | It enforces that the batch only contain elements from the same group. 46 | It also tries to provide mini-batches which follow an ordering which is 47 | as close as possible to the ordering from the original sampler. 48 | Arguments: 49 | sampler (Sampler): Base sampler. 50 | group_ids (list[int]): If the sampler produces indices in range [0, N), 51 | `group_ids` must be a list of `N` ints which contains the group id of each sample.
52 | The group ids must be a continuous set of integers starting from 53 | 0, i.e. they must be in the range [0, num_groups). 54 | batch_size (int): Size of mini-batch. 55 | """ 56 | def __init__(self, sampler, group_ids, batch_size): 57 | if not isinstance(sampler, Sampler): 58 | raise ValueError( 59 | "sampler should be an instance of " 60 | "torch.utils.data.Sampler, but got sampler={}".format(sampler) 61 | ) 62 | self.sampler = sampler 63 | self.group_ids = group_ids 64 | self.batch_size = batch_size 65 | 66 | def __iter__(self): 67 | buffer_per_group = defaultdict(list) 68 | samples_per_group = defaultdict(list) 69 | 70 | num_batches = 0 71 | for idx in self.sampler: 72 | group_id = self.group_ids[idx] 73 | buffer_per_group[group_id].append(idx) 74 | samples_per_group[group_id].append(idx) 75 | if len(buffer_per_group[group_id]) == self.batch_size: 76 | yield buffer_per_group[group_id] #TODO 77 | num_batches += 1 78 | del buffer_per_group[group_id] 79 | assert len(buffer_per_group[group_id]) < self.batch_size 80 | 81 | # now we have run out of elements that satisfy 82 | # the group criteria, let's return the remaining 83 | # elements so that the size of the sampler is 84 | # deterministic 85 | expected_num_batches = len(self) 86 | num_remaining = expected_num_batches - num_batches 87 | if num_remaining > 0: 88 | # for the remaining batches, group the batches by similar lengths 89 | batch_idx = [] 90 | for group_id, idxs in sorted(buffer_per_group.items(), key=lambda x: x[0]): 91 | batch_idx.extend(idxs) 92 | if len(batch_idx) >= self.batch_size: 93 | yield batch_idx[:self.batch_size] 94 | batch_idx = batch_idx[self.batch_size:] 95 | num_remaining -= 1 96 | if len(batch_idx) > 0: 97 | yield batch_idx 98 | num_remaining -= 1 99 | assert num_remaining == 0 100 | 101 | def __len__(self): 102 | """ 103 | Return the number of mini-batches rather than the number of samples. 104 | """ 105 | return (len(self.sampler) + self.batch_size - 1) // self.batch_size 106 | -------------------------------------------------------------------------------- /transformers/docs/source/converting_tensorflow_models.rst: -------------------------------------------------------------------------------- 1 | Converting TensorFlow Checkpoints 2 | ================================================ 3 | 4 | A command-line interface is provided to convert original BERT/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints into models that can be loaded using the ``from_pretrained`` methods of the library. 5 | 6 | BERT 7 | ^^^^ 8 | 9 | You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google `_\ ) into a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py `_ script. 10 | 11 | This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint into the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py `_\ , `run_bert_classifier.py `_ and `run_bert_squad.py `_\ ). 12 | 13 | You only need to run this conversion script **once** to get a PyTorch model.
You can then disregard the TensorFlow checkpoint (the three files starting with ``bert_model.ckpt``\ ) but be sure to keep the configuration file (\ ``bert_config.json``\ ) and the vocabulary file (\ ``vocab.txt``\ ) as these are needed for the PyTorch model too. 14 | 15 | To run this specific conversion script you will need to have TensorFlow and PyTorch installed (\ ``pip install tensorflow``\ ). The rest of the repository only requires PyTorch. 16 | 17 | Here is an example of the conversion process for a pre-trained ``BERT-Base Uncased`` model: 18 | 19 | .. code-block:: shell 20 | 21 | export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12 22 | 23 | transformers bert \ 24 | $BERT_BASE_DIR/bert_model.ckpt \ 25 | $BERT_BASE_DIR/bert_config.json \ 26 | $BERT_BASE_DIR/pytorch_model.bin 27 | 28 | You can download Google's pre-trained models for the conversion `here `__. 29 | 30 | OpenAI GPT 31 | ^^^^^^^^^^ 32 | 33 | Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint is saved in the same format as the OpenAI pretrained model (see `here `__\ ) 34 | 35 | .. code-block:: shell 36 | 37 | export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights 38 | 39 | transformers gpt \ 40 | $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \ 41 | $PYTORCH_DUMP_OUTPUT \ 42 | [OPENAI_GPT_CONFIG] 43 | 44 | OpenAI GPT-2 45 | ^^^^^^^^^^^^ 46 | 47 | Here is an example of the conversion process for a pre-trained OpenAI GPT-2 model (see `here `__\ ) 48 | 49 | .. code-block:: shell 50 | 51 | export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights 52 | 53 | transformers gpt2 \ 54 | $OPENAI_GPT2_CHECKPOINT_PATH \ 55 | $PYTORCH_DUMP_OUTPUT \ 56 | [OPENAI_GPT2_CONFIG] 57 | 58 | Transformer-XL 59 | ^^^^^^^^^^^^^^ 60 | 61 | Here is an example of the conversion process for a pre-trained Transformer-XL model (see `here `__\ ) 62 | 63 | .. code-block:: shell 64 | 65 | export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint 66 | 67 | transformers transfo_xl \ 68 | $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \ 69 | $PYTORCH_DUMP_OUTPUT \ 70 | [TRANSFO_XL_CONFIG] 71 | 72 | 73 | XLNet 74 | ^^^^^ 75 | 76 | Here is an example of the conversion process for a pre-trained XLNet model, fine-tuned on STS-B using the TensorFlow script: 77 | 78 | .. code-block:: shell 79 | 80 | export XLNET_CHECKPOINT_PATH=/path/to/xlnet/checkpoint 81 | export XLNET_CONFIG_PATH=/path/to/xlnet/config 82 | 83 | transformers xlnet \ 84 | $XLNET_CHECKPOINT_PATH \ 85 | $XLNET_CONFIG_PATH \ 86 | $PYTORCH_DUMP_OUTPUT \ 87 | STS-B 88 | 89 | 90 | XLM 91 | ^^^ 92 | 93 | Here is an example of the conversion process for a pre-trained XLM model: 94 | 95 | .. code-block:: shell 96 | 97 | export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint 98 | 99 | transformers xlm \ 100 | $XLM_CHECKPOINT_PATH \ 101 | $PYTORCH_DUMP_OUTPUT 102 | -------------------------------------------------------------------------------- /transformers/examples/adversarial/utils_hans.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import copy 18 | import csv 19 | import json 20 | 21 | 22 | class InputExample(object): 23 | """ 24 | A single training/test example for simple sequence classification. 25 | 26 | Args: 27 | guid: Unique id for the example. 28 | text_a: string. The untokenized text of the first sequence. For single 29 | sequence tasks, only this sequence must be specified. 30 | text_b: (Optional) string. The untokenized text of the second sequence. 31 | Only must be specified for sequence pair tasks. 32 | label: (Optional) string. The label of the example. This should be 33 | specified for train and dev examples, but not for test examples. 34 | """ 35 | 36 | def __init__(self, guid, text_a, text_b=None, label=None, pairID=None): 37 | self.guid = guid 38 | self.text_a = text_a 39 | self.text_b = text_b 40 | self.label = label 41 | self.pairID = pairID 42 | 43 | def __repr__(self): 44 | return str(self.to_json_string()) 45 | 46 | def to_dict(self): 47 | """Serializes this instance to a Python dictionary.""" 48 | output = copy.deepcopy(self.__dict__) 49 | return output 50 | 51 | def to_json_string(self): 52 | """Serializes this instance to a JSON string.""" 53 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 54 | 55 | 56 | class InputFeatures(object): 57 | """ 58 | A single set of features of data. 59 | 60 | Args: 61 | input_ids: Indices of input sequence tokens in the vocabulary. 62 | attention_mask: Mask to avoid performing attention on padding token indices. 63 | Mask values selected in ``[0, 1]``: 64 | Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens. 65 | token_type_ids: Segment token indices to indicate first and second portions of the inputs. 66 | label: Label corresponding to the input 67 | """ 68 | 69 | def __init__(self, input_ids, attention_mask, token_type_ids, label, pairID=None): 70 | self.input_ids = input_ids 71 | self.attention_mask = attention_mask 72 | self.token_type_ids = token_type_ids 73 | self.label = label 74 | self.pairID = pairID 75 | 76 | def __repr__(self): 77 | return str(self.to_json_string()) 78 | 79 | def to_dict(self): 80 | """Serializes this instance to a Python dictionary.""" 81 | output = copy.deepcopy(self.__dict__) 82 | return output 83 | 84 | def to_json_string(self): 85 | """Serializes this instance to a JSON string.""" 86 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 87 | 88 | 89 | class DataProcessor(object): 90 | """Base class for data converters for sequence classification data sets.""" 91 | 92 | def get_example_from_tensor_dict(self, tensor_dict): 93 | """Gets an example from a dict with tensorflow tensors 94 | 95 | Args: 96 | tensor_dict: Keys and values should match the corresponding Glue 97 | tensorflow_dataset examples. 
98 | """ 99 | raise NotImplementedError() 100 | 101 | def get_train_examples(self, data_dir): 102 | """Gets a collection of `InputExample`s for the train set.""" 103 | raise NotImplementedError() 104 | 105 | def get_dev_examples(self, data_dir): 106 | """Gets a collection of `InputExample`s for the dev set.""" 107 | raise NotImplementedError() 108 | 109 | def get_labels(self): 110 | """Gets the list of labels for this data set.""" 111 | raise NotImplementedError() 112 | 113 | @classmethod 114 | def _read_tsv(cls, input_file, quotechar=None): 115 | """Reads a tab separated value file.""" 116 | with open(input_file, "r", encoding="utf-8-sig") as f: 117 | reader = csv.reader(f, delimiter="\t", quotechar=quotechar) 118 | lines = [] 119 | for line in reader: 120 | lines.append(line) 121 | return lines 122 | -------------------------------------------------------------------------------- /transformers/examples/distillation/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team and Facebook, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Utils to train DistilBERT 16 | adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) 17 | """ 18 | import git 19 | import json 20 | import os 21 | import socket 22 | import torch 23 | import numpy as np 24 | 25 | import logging 26 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d - %(message)s', 27 | datefmt = '%m/%d/%Y %H:%M:%S', 28 | level = logging.INFO) 29 | logger = logging.getLogger(__name__) 30 | 31 | 32 | def git_log(folder_path: str): 33 | """ 34 | Log commit info. 35 | """ 36 | repo = git.Repo(search_parent_directories=True) 37 | repo_infos = { 38 | 'repo_id': str(repo), 39 | 'repo_sha': str(repo.head.object.hexsha), 40 | 'repo_branch': str(repo.active_branch) 41 | } 42 | 43 | with open(os.path.join(folder_path, 'git_log.json'), 'w') as f: 44 | json.dump(repo_infos, f, indent=4) 45 | 46 | 47 | def init_gpu_params(params): 48 | """ 49 | Handle single and multi-GPU / multi-node. 
50 | """ 51 | if params.n_gpu <= 0: 52 | params.local_rank = 0 53 | params.master_port = -1 54 | params.is_master = True 55 | params.multi_gpu = False 56 | return 57 | 58 | assert torch.cuda.is_available() 59 | 60 | logger.info('Initializing GPUs') 61 | if params.n_gpu > 1: 62 | assert params.local_rank != -1 63 | 64 | params.world_size = int(os.environ['WORLD_SIZE']) 65 | params.n_gpu_per_node = int(os.environ['N_GPU_NODE']) 66 | params.global_rank = int(os.environ['RANK']) 67 | 68 | # number of nodes / node ID 69 | params.n_nodes = params.world_size // params.n_gpu_per_node 70 | params.node_id = params.global_rank // params.n_gpu_per_node 71 | params.multi_gpu = True 72 | 73 | assert params.n_nodes == int(os.environ['N_NODES']) 74 | assert params.node_id == int(os.environ['NODE_RANK']) 75 | 76 | # local job (single GPU) 77 | else: 78 | assert params.local_rank == -1 79 | 80 | params.n_nodes = 1 81 | params.node_id = 0 82 | params.local_rank = 0 83 | params.global_rank = 0 84 | params.world_size = 1 85 | params.n_gpu_per_node = 1 86 | params.multi_gpu = False 87 | 88 | # sanity checks 89 | assert params.n_nodes >= 1 90 | assert 0 <= params.node_id < params.n_nodes 91 | assert 0 <= params.local_rank <= params.global_rank < params.world_size 92 | assert params.world_size == params.n_nodes * params.n_gpu_per_node 93 | 94 | # define whether this is the master process / if we are in multi-node distributed mode 95 | params.is_master = params.node_id == 0 and params.local_rank == 0 96 | params.multi_node = params.n_nodes > 1 97 | 98 | # summary 99 | PREFIX = f"--- Global rank: {params.global_rank} - " 100 | logger.info(PREFIX + "Number of nodes: %i" % params.n_nodes) 101 | logger.info(PREFIX + "Node ID : %i" % params.node_id) 102 | logger.info(PREFIX + "Local rank : %i" % params.local_rank) 103 | logger.info(PREFIX + "World size : %i" % params.world_size) 104 | logger.info(PREFIX + "GPUs per node : %i" % params.n_gpu_per_node) 105 | logger.info(PREFIX + "Master : %s" % str(params.is_master)) 106 | logger.info(PREFIX + "Multi-node : %s" % str(params.multi_node)) 107 | logger.info(PREFIX + "Multi-GPU : %s" % str(params.multi_gpu)) 108 | logger.info(PREFIX + "Hostname : %s" % socket.gethostname()) 109 | 110 | # set GPU device 111 | torch.cuda.set_device(params.local_rank) 112 | 113 | # initialize multi-GPU 114 | if params.multi_gpu: 115 | logger.info("Initializing PyTorch distributed") 116 | torch.distributed.init_process_group( 117 | init_method='env://', 118 | backend='nccl', 119 | ) 120 | 121 | 122 | def set_seed(args): 123 | """ 124 | Set the random seed. 125 | """ 126 | np.random.seed(args.seed) 127 | torch.manual_seed(args.seed) 128 | if args.n_gpu > 0: 129 | torch.cuda.manual_seed_all(args.seed) 130 | -------------------------------------------------------------------------------- /transformers/transformers/data/processors/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import csv 18 | import sys 19 | import copy 20 | import json 21 | 22 | class InputExample(object): 23 | """ 24 | A single training/test example for simple sequence classification. 25 | 26 | Args: 27 | guid: Unique id for the example. 28 | text_a: string. The untokenized text of the first sequence. For single 29 | sequence tasks, only this sequence must be specified. 30 | text_b: (Optional) string. The untokenized text of the second sequence. 31 | Only must be specified for sequence pair tasks. 32 | label: (Optional) string. The label of the example. This should be 33 | specified for train and dev examples, but not for test examples. 34 | """ 35 | def __init__(self, guid, text_a, text_b=None, label=None): 36 | self.guid = guid 37 | self.text_a = text_a 38 | self.text_b = text_b 39 | self.label = label 40 | 41 | def __repr__(self): 42 | return str(self.to_json_string()) 43 | 44 | def to_dict(self): 45 | """Serializes this instance to a Python dictionary.""" 46 | output = copy.deepcopy(self.__dict__) 47 | return output 48 | 49 | def to_json_string(self): 50 | """Serializes this instance to a JSON string.""" 51 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 52 | 53 | 54 | class InputFeatures(object): 55 | """ 56 | A single set of features of data. 57 | 58 | Args: 59 | input_ids: Indices of input sequence tokens in the vocabulary. 60 | attention_mask: Mask to avoid performing attention on padding token indices. 61 | Mask values selected in ``[0, 1]``: 62 | Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens. 63 | token_type_ids: Segment token indices to indicate first and second portions of the inputs. 64 | label: Label corresponding to the input 65 | """ 66 | 67 | def __init__(self, input_ids, attention_mask, token_type_ids, label): 68 | self.input_ids = input_ids 69 | self.attention_mask = attention_mask 70 | self.token_type_ids = token_type_ids 71 | self.label = label 72 | 73 | def __repr__(self): 74 | return str(self.to_json_string()) 75 | 76 | def to_dict(self): 77 | """Serializes this instance to a Python dictionary.""" 78 | output = copy.deepcopy(self.__dict__) 79 | return output 80 | 81 | def to_json_string(self): 82 | """Serializes this instance to a JSON string.""" 83 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 84 | 85 | 86 | class DataProcessor(object): 87 | """Base class for data converters for sequence classification data sets.""" 88 | 89 | def get_example_from_tensor_dict(self, tensor_dict): 90 | """Gets an example from a dict with tensorflow tensors 91 | 92 | Args: 93 | tensor_dict: Keys and values should match the corresponding Glue 94 | tensorflow_dataset examples. 
95 | """ 96 | raise NotImplementedError() 97 | 98 | def get_train_examples(self, data_dir): 99 | """Gets a collection of `InputExample`s for the train set.""" 100 | raise NotImplementedError() 101 | 102 | def get_dev_examples(self, data_dir): 103 | """Gets a collection of `InputExample`s for the dev set.""" 104 | raise NotImplementedError() 105 | 106 | def get_labels(self): 107 | """Gets the list of labels for this data set.""" 108 | raise NotImplementedError() 109 | 110 | @classmethod 111 | def _read_tsv(cls, input_file, quotechar=None): 112 | """Reads a tab separated value file.""" 113 | with open(input_file, "r", encoding="utf-8-sig") as f: 114 | reader = csv.reader(f, delimiter="\t", quotechar=quotechar) 115 | lines = [] 116 | for line in reader: 117 | if sys.version_info[0] == 2: 118 | line = list(unicode(cell, 'utf-8') for cell in line) 119 | lines.append(line) 120 | return lines 121 | -------------------------------------------------------------------------------- /transformers/examples/parallel.py: -------------------------------------------------------------------------------- 1 | from typing import List, Callable 2 | from functools import partial 3 | from concurrent import futures 4 | from contextlib import contextmanager 5 | from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor 6 | from collections import OrderedDict 7 | @contextmanager 8 | def NoPoolExecutor(): 9 | """ 10 | Provides the same interface as ThreadPoolExecutor and ProcessPoolExecutor, 11 | but does not do any threading / multiprocessing. 12 | Submitting to NoPoolExecutor returns a function wrapper that acts like a Future, 13 | but the function is executed synchronously when .result() is called. 14 | """ 15 | class NoPoolExecutorInner: 16 | class NoFuture: 17 | def __init__(self, fn, **kwargs): 18 | self.fn = partial(fn, **kwargs) 19 | def result(self): 20 | return self.fn() 21 | def submit(self, fn, **kwargs): 22 | return NoPoolExecutorInner.NoFuture(fn, **kwargs) 23 | def shutdown(self, *args, **kwargs): 24 | pass 25 | yield NoPoolExecutorInner() 26 | def parallelized_iterator( 27 | fn: Callable, 28 | kwargs_list: List[dict], 29 | tags_list: List[object] = None, 30 | scheduler: str = "threads", 31 | max_workers: int = 0, 32 | progress: bool = True, 33 | progress_auto: bool = True, 34 | progress_desc: Callable[[int], str] = None, 35 | progress_position: int = 0, 36 | keep_order: bool = False, 37 | cleanup: Callable[[ThreadPoolExecutor], None] = None 38 | ): 39 | """ 40 | Args: 41 | fn: function to execute 42 | kwargs_list: arguments for each future call 43 | tags_list: return argument associated to each future 44 | scheduler: backend for threading. Defaults to "threads". 45 | max_workers: max workers. 46 | progress: show progress. 47 | progress_auto: show progress with auto. 48 | progress_desc: function that produces a description. 49 | keep_order: return results in order of kwargs 50 | cleanup: function that handles exception cleanup. 
51 | Yields: 52 | for each kwargs in kwargs_list: 53 | kwargs, fn(**kwargs) 54 | """ 55 | if not kwargs_list: 56 | return [] 57 | if not isinstance(kwargs_list[0], dict): 58 | raise ValueError("kwargs_list elements must be a dict.") 59 | if cleanup is None: 60 | def default_cleanup(executor): 61 | executor.shutdown(wait=True) 62 | cleanup = default_cleanup 63 | if progress_auto: 64 | from tqdm.auto import tqdm 65 | else: 66 | from tqdm import tqdm 67 | if tags_list is None: 68 | tags_list = kwargs_list 69 | if len(tags_list) != len(kwargs_list): 70 | raise ValueError( 71 | "Number of tags should match the number of jobs to parallelize.") 72 | ordered = lambda x: x # noqa: E731 73 | if max_workers <= 1: 74 | pool_executor_cls = NoPoolExecutor 75 | executor_kwargs = dict() 76 | as_completed_fn = ordered 77 | elif scheduler == "processes": 78 | pool_executor_cls = ProcessPoolExecutor 79 | executor_kwargs = dict(max_workers=max_workers) 80 | as_completed_fn = ordered if keep_order else futures.as_completed 81 | elif scheduler == "threads": 82 | pool_executor_cls = ThreadPoolExecutor 83 | executor_kwargs = dict(max_workers=max_workers) 84 | as_completed_fn = ordered if keep_order else futures.as_completed 85 | else: 86 | raise ValueError("Wrong scheduler: %s" % scheduler) 87 | with pool_executor_cls(**executor_kwargs) as executor: 88 | futures_dict = OrderedDict( 89 | (executor.submit(fn, **kwargs), tag) 90 | for kwargs, tag in zip(kwargs_list, tags_list)) 91 | tqdm_args = {} 92 | tqdm_args["total"] = len(kwargs_list) 93 | tqdm_args["disable"] = not progress 94 | tqdm_args["position"] = progress_position 95 | with tqdm(**tqdm_args) as bar: 96 | try: 97 | if progress_desc is not None: 98 | bar.set_description(progress_desc(0)) 99 | for i, future in enumerate(as_completed_fn(futures_dict)): 100 | bar.update(1) 101 | if progress_desc is not None: 102 | # strip a right '.' because it doesn't look good with tqdm 103 | bar.set_description(progress_desc(i + 1).rstrip('.')) 104 | tag = futures_dict[future] 105 | yield tag, future.result() 106 | finally: 107 | cleanup(executor) 108 | def parallelized( 109 | fn, 110 | kwargs_list: List[dict], 111 | tags_list: List[object] = None, 112 | scheduler: str = "threads", 113 | max_workers: int = 0, 114 | progress: bool = True, 115 | progress_auto: bool = True, 116 | progress_desc: Callable[[int], str] = None, 117 | progress_position: int = 0, 118 | keep_order: bool = True, 119 | cleanup: Callable[[ThreadPoolExecutor], None] = None, 120 | ): 121 | results = [] 122 | for _, result in parallelized_iterator( 123 | fn, kwargs_list, tags_list, scheduler, max_workers, 124 | progress, progress_auto, progress_desc, 125 | progress_position, keep_order, cleanup): 126 | results.append(result) 127 | return results 128 | -------------------------------------------------------------------------------- /transformers/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" 17 | 18 | import os 19 | import argparse 20 | import torch 21 | import numpy as np 22 | import tensorflow as tf 23 | from transformers import BertModel 24 | 25 | 26 | def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str): 27 | 28 | """ 29 | :param model:BertModel Pytorch model instance to be converted 30 | :param ckpt_dir: Tensorflow model directory 31 | :param model_name: model name 32 | :return: 33 | 34 | Currently supported HF models: 35 | Y BertModel 36 | N BertForMaskedLM 37 | N BertForPreTraining 38 | N BertForMultipleChoice 39 | N BertForNextSentencePrediction 40 | N BertForSequenceClassification 41 | N BertForQuestionAnswering 42 | """ 43 | 44 | tensors_to_transpose = ( 45 | "dense.weight", 46 | "attention.self.query", 47 | "attention.self.key", 48 | "attention.self.value" 49 | ) 50 | 51 | var_map = ( 52 | ('layer.', 'layer_'), 53 | ('word_embeddings.weight', 'word_embeddings'), 54 | ('position_embeddings.weight', 'position_embeddings'), 55 | ('token_type_embeddings.weight', 'token_type_embeddings'), 56 | ('.', '/'), 57 | ('LayerNorm/weight', 'LayerNorm/gamma'), 58 | ('LayerNorm/bias', 'LayerNorm/beta'), 59 | ('weight', 'kernel') 60 | ) 61 | 62 | if not os.path.isdir(ckpt_dir): 63 | os.makedirs(ckpt_dir) 64 | 65 | state_dict = model.state_dict() 66 | 67 | def to_tf_var_name(name:str): 68 | for patt, repl in iter(var_map): 69 | name = name.replace(patt, repl) 70 | return 'bert/{}'.format(name) 71 | 72 | def create_tf_var(tensor:np.ndarray, name:str, session:tf.Session): 73 | tf_dtype = tf.dtypes.as_dtype(tensor.dtype) 74 | tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) 75 | session.run(tf.variables_initializer([tf_var])) 76 | session.run(tf_var) 77 | return tf_var 78 | 79 | tf.reset_default_graph() 80 | with tf.Session() as session: 81 | for var_name in state_dict: 82 | tf_name = to_tf_var_name(var_name) 83 | torch_tensor = state_dict[var_name].numpy() 84 | if any([x in var_name for x in tensors_to_transpose]): 85 | torch_tensor = torch_tensor.T 86 | tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) 87 | tf.keras.backend.set_value(tf_var, torch_tensor) 88 | tf_weight = session.run(tf_var) 89 | print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor))) 90 | 91 | saver = tf.train.Saver(tf.trainable_variables()) 92 | saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) 93 | 94 | 95 | def main(raw_args=None): 96 | parser = argparse.ArgumentParser() 97 | parser.add_argument("--model_name", 98 | type=str, 99 | required=True, 100 | help="model name e.g. 
bert-base-uncased") 101 | parser.add_argument("--cache_dir", 102 | type=str, 103 | default=None, 104 | required=False, 105 | help="Directory containing pytorch model") 106 | parser.add_argument("--pytorch_model_path", 107 | type=str, 108 | required=True, 109 | help="/path/to/.bin") 110 | parser.add_argument("--tf_cache_dir", 111 | type=str, 112 | required=True, 113 | help="Directory in which to save tensorflow model") 114 | args = parser.parse_args(raw_args) 115 | 116 | model = BertModel.from_pretrained( 117 | pretrained_model_name_or_path=args.model_name, 118 | state_dict=torch.load(args.pytorch_model_path), 119 | cache_dir=args.cache_dir 120 | ) 121 | 122 | convert_pytorch_checkpoint_to_tf( 123 | model=model, 124 | ckpt_dir=args.tf_cache_dir, 125 | model_name=args.model_name 126 | ) 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | --------------------------------------------------------------------------------