├── dialogue_generation ├── pycocoevalcap │ ├── __init__.py │ ├── cider │ │ ├── __init__.py │ │ └── cider.py │ ├── meteor │ │ ├── __init__.py │ │ └── meteor.py │ ├── rouge │ │ ├── __init__.py │ │ └── rouge.py │ ├── bleu │ │ ├── __init__.py │ │ ├── LICENSE │ │ └── bleu.py │ ├── tokenizer │ │ ├── __init__.py │ │ └── ptbtokenizer.py │ └── eval.py ├── requirements.txt ├── test_special_tokens.py ├── eval_utils.py └── utils.py ├── language_modeling ├── pytorch_transformers │ ├── tests │ │ ├── __init__.py │ │ ├── fixtures │ │ │ ├── input.txt │ │ │ ├── test_sentencepiece.model │ │ │ └── sample_text.txt │ │ ├── conftest.py │ │ ├── tokenization_dilbert_test.py │ │ ├── tokenization_auto_test.py │ │ ├── tokenization_utils_test.py │ │ ├── configuration_common_test.py │ │ ├── tokenization_openai_test.py │ │ ├── tokenization_transfo_xl_test.py │ │ ├── tokenization_gpt2_test.py │ │ ├── tokenization_xlm_test.py │ │ ├── modeling_auto_test.py │ │ ├── tokenization_roberta_test.py │ │ ├── tokenization_xlnet_test.py │ │ ├── tokenization_bert_test.py │ │ └── optimization_test.py │ ├── tokenization_bart.py │ ├── configuration_roberta.py │ ├── convert_tf_checkpoint_to_pytorch.py │ ├── convert_xlm_checkpoint_to_pytorch.py │ ├── convert_gpt2_checkpoint_to_pytorch.py │ ├── convert_openai_checkpoint_to_pytorch.py │ ├── configuration_distilbert.py │ ├── tokenization_distilbert.py │ ├── __init__.py │ ├── convert_xlnet_checkpoint_to_pytorch.py │ ├── convert_pytorch_checkpoint_to_tf.py │ ├── configuration_openai.py │ ├── tokenization_flaubert.py │ ├── convert_transfo_xl_checkpoint_to_pytorch.py │ ├── configuration_gpt2.py │ ├── configuration_bert.py │ └── __main__.py ├── requirements.txt ├── examples │ ├── requirements.txt │ ├── tests_samples │ │ ├── .gitignore │ │ └── MRPC │ │ │ ├── dev.tsv │ │ │ └── train.tsv │ ├── distillation │ │ ├── requirements.txt │ │ ├── scripts │ │ │ ├── token_counts.py │ │ │ ├── binarized_data.py │ │ │ └── extract_for_distil.py │ │ ├── utils.py │ │ └── README.md │ ├── get_epsilon_perplexity │ │ └── eps_ppl.py │ ├── test_examples.py │ ├── lm_finetuning │ │ └── README.md │ └── single_model_scripts │ │ └── run_transfo_xl.py ├── pytorch_transformers.egg-info │ ├── dependency_links.txt │ ├── top_level.txt │ ├── entry_points.txt │ ├── requires.txt │ └── SOURCES.txt ├── docker │ └── Dockerfile ├── .github │ ├── ISSUE_TEMPLATE │ │ ├── question-help.md │ │ ├── feature-request.md │ │ ├── bug-report.md │ │ └── migration.md │ └── stale.yml ├── .coveragerc ├── .circleci │ └── config.yml ├── setup.py └── hubconf.py └── README.md /dialogue_generation/pycocoevalcap/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dialogue_generation/pycocoevalcap/cider/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dialogue_generation/pycocoevalcap/meteor/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dialogue_generation/pycocoevalcap/rouge/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dialogue_generation/pycocoevalcap/bleu/__init__.py: 
-------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /dialogue_generation/pycocoevalcap/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /language_modeling/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.0.0 2 | entmax>=1.0 3 | -------------------------------------------------------------------------------- /language_modeling/examples/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboardX 2 | scikit-learn -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | pytorch_transformers 2 | -------------------------------------------------------------------------------- /language_modeling/examples/tests_samples/.gitignore: -------------------------------------------------------------------------------- 1 | *.* 2 | cache* 3 | temp* 4 | !*.tsv 5 | !*.json 6 | !.gitignore -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/tests/fixtures/input.txt: -------------------------------------------------------------------------------- 1 | Who was Jim Henson ? 
||| Jim Henson was a puppeteer 2 | -------------------------------------------------------------------------------- /language_modeling/examples/distillation/requirements.txt: -------------------------------------------------------------------------------- 1 | gitpython==3.0.2 2 | tensorboard>=1.14.0 3 | tensorboardX==1.8 4 | psutil>=5.6.6 5 | -------------------------------------------------------------------------------- /dialogue_generation/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | pytorch-ignite 3 | pytorch-transformers>=1.2 4 | tensorboardX==1.8 5 | tensorflow # for tensorboardX 6 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers.egg-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [console_scripts] 2 | pytorch_transformers = pytorch_transformers.__main__:main 3 | 4 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | torch>=1.0.0 2 | numpy 3 | boto3 4 | requests 5 | tqdm 6 | regex 7 | sentencepiece 8 | sacremoses 9 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/tests/fixtures/test_sentencepiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/sparse_text_generation/HEAD/language_modeling/pytorch_transformers/tests/fixtures/test_sentencepiece.model -------------------------------------------------------------------------------- /language_modeling/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:latest 2 | 3 | RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext 4 | 5 | RUN pip install pytorch_transformers 6 | 7 | WORKDIR /workspace -------------------------------------------------------------------------------- /language_modeling/.github/ISSUE_TEMPLATE/question-help.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "❓Questions & Help" 3 | about: Start a general discussion related to PyTorch Transformers 4 | --- 5 | 6 | ## ❓ Questions & Help 7 | 8 | -------------------------------------------------------------------------------- /language_modeling/.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source=pytorch_transformers 3 | omit = 4 | # skip conversion scripts from testing for now 5 | */convert_* 6 | */__main__.py 7 | [report] 8 | exclude_lines = 9 | pragma: no cover 10 | raise 11 | except 12 | register_parameter -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # content of conftest.py 2 | 3 | import pytest 4 | 5 | 6 | def pytest_addoption(parser): 7 | parser.addoption( 8 | "--runslow", action="store_true", default=False, help="run slow tests" 9 | ) 10 | 11 | 12 | def pytest_collection_modifyitems(config, items): 13 | if config.getoption("--runslow"): 14 | # --runslow given in cli: do not skip slow tests 15 | return 16 | skip_slow =
pytest.mark.skip(reason="need --runslow option to run") 17 | for item in items: 18 | if "slow" in item.keywords: 19 | item.add_marker(skip_slow) 20 | -------------------------------------------------------------------------------- /language_modeling/.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F680 Feature Request" 3 | about: Submit a proposal/request for a new PyTorch Transformers feature 4 | --- 5 | 6 | ## 🚀 Feature 7 | 8 | 9 | 10 | ## Motivation 11 | 12 | 13 | 14 | ## Additional context 15 | 16 | -------------------------------------------------------------------------------- /language_modeling/.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 60 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 7 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - pinned 8 | - security 9 | # Label to use when marking an issue as stale 10 | staleLabel: wontfix 11 | # Comment to post when marking an issue as stale. Set to `false` to disable 12 | markComment: > 13 | This issue has been automatically marked as stale because it has not had 14 | recent activity. It will be closed if no further activity occurs. Thank you 15 | for your contributions. 16 | # Comment to post when closing a stale issue. Set to `false` to disable 17 | closeComment: false -------------------------------------------------------------------------------- /dialogue_generation/pycocoevalcap/bleu/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
20 | -------------------------------------------------------------------------------- /language_modeling/.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F41B Bug Report" 3 | about: Submit a bug report to help us improve PyTorch Transformers 4 | --- 5 | 6 | ## 🐛 Bug 7 | 8 | 9 | 10 | Model I am using (Bert, XLNet....): 11 | 12 | Language I am using the model on (English, Chinese....): 13 | 14 | The problem arises when using: 15 | * [ ] the official example scripts: (give details) 16 | * [ ] my own modified scripts: (give details) 17 | 18 | The task I am working on is: 19 | * [ ] an official GLUE/SQuAD task: (give the name) 20 | * [ ] my own task or dataset: (give details) 21 | 22 | ## To Reproduce 23 | 24 | Steps to reproduce the behavior: 25 | 26 | 1. 27 | 2. 28 | 3. 29 | 30 | 31 | 32 | ## Expected behavior 33 | 34 | 35 | 36 | ## Environment 37 | 38 | * OS: 39 | * Python version: 40 | * PyTorch version: 41 | * PyTorch Transformers version (or branch): 42 | * Using GPU? 43 | * Distributed or parallel setup? 44 | * Any other relevant information: 45 | 46 | ## Additional context 47 | 48 | -------------------------------------------------------------------------------- /language_modeling/examples/get_epsilon_perplexity/eps_ppl.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import pdb 4 | import pickle 5 | 6 | with open('p_target', 'rb') as f: 7 | p_ = pickle.load(f) 8 | 9 | 10 | def find_nearest(array, value): 11 | array = np.asarray(array) 12 | idx = (np.abs(array - value)).argmin() 13 | return array[idx], idx 14 | 15 | p = np.array(p_) 16 | V = 50257 17 | n_sents = len(p) 18 | 19 | lambdas = np.linspace(0, .7, 10000) 20 | losses = np.zeros_like(lambdas) 21 | 22 | for i in range(len(lambdas)): 23 | lambda_ = lambdas[i] 24 | # Assume groundtruth is 0. 25 | loss = -np.sum(np.log((1-lambda_)*p + lambda_/V)) / n_sents 26 | losses[i] = loss 27 | 28 | lambda_ = .5 29 | eta = .00001 30 | c = np.ones(n_sents)/n_sents 31 | r = p / (1./V - p) 32 | for iter in range(100000): 33 | lambda_ = lambda_ + eta * (c/(lambda_ + r)).sum() 34 | if lambda_ < 0.: 35 | lambda_ = 0. 36 | elif lambda_ > 1.: 37 | lambda_ = 1.
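# Annotation on the loop above (derivable from the code, not part of the original file):
# it is projected gradient descent on the smoothed cross-entropy
#     L(lambda) = -(1/n) * sum_i log((1 - lambda) * p_i + lambda / V).
# With c_i = 1/n and r_i = p_i / (1/V - p_i), a little algebra gives
#     -dL/dlambda = sum_i c_i / (lambda + r_i),
# so each update moves lambda downhill on L, and the clipping projects it back into [0, 1].
# The eps computed below satisfies (1 - lambda) * (p_i + eps) = (1 - lambda) * p_i + lambda / V,
# i.e. it is the additive per-token smoothing constant equivalent to the mixture weight lambda.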
38 | print(lambda_) 39 | 40 | 41 | eps = lambda_/(V * (1-lambda_)) 42 | 43 | lambda_, idx = find_nearest(lambdas, lambda_) 44 | 45 | 46 | print('perplexity:', np.exp(losses[idx])) 47 | 48 | print('optimal epsilon:', eps) 49 | 50 | 51 | plt.plot(lambdas, losses) 52 | plt.show() -------------------------------------------------------------------------------- /language_modeling/.github/ISSUE_TEMPLATE/migration.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F4DA Migration from PyTorch-pretrained-Bert" 3 | about: Report a problem when migrating from PyTorch-pretrained-Bert to PyTorch-Transformers 4 | --- 5 | 6 | ## 📚 Migration 7 | 8 | 9 | 10 | Model I am using (Bert, XLNet....): 11 | 12 | Language I am using the model on (English, Chinese....): 13 | 14 | The problem arises when using: 15 | * [ ] the official example scripts: (give details) 16 | * [ ] my own modified scripts: (give details) 17 | 18 | The task I am working on is: 19 | * [ ] an official GLUE/SQuAD task: (give the name) 20 | * [ ] my own task or dataset: (give details) 21 | 22 | Details of the issue: 23 | 24 | 25 | 26 | ## Environment 27 | 28 | * OS: 29 | * Python version: 30 | * PyTorch version: 31 | * PyTorch Transformers version (or branch): 32 | * Using GPU? 33 | * Distributed or parallel setup? 34 | * Any other relevant information: 35 | 36 | ## Checklist 37 | 38 | - [ ] I have read the migration guide in the readme. 39 | - [ ] I checked if a related official extension example runs on my machine. 40 | 41 | ## Additional context 42 | 43 | -------------------------------------------------------------------------------- /language_modeling/examples/tests_samples/MRPC/dev.tsv: -------------------------------------------------------------------------------- 1 | Quality #1 ID #2 ID #1 String #2 String 2 | 1 1355540 1355592 He said the foodservice pie business doesn 't fit the company 's long-term growth strategy . " The foodservice pie business does not fit our long-term growth strategy . 3 | 0 2029631 2029565 Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war . His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war . 4 | 0 487993 487952 The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat . The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent . 5 | 1 1989515 1989458 The AFL-CIO is waiting until October to decide if it will endorse a candidate . The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries . 6 | 0 1783137 1782659 No dates have been set for the civil or the criminal trial . No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty . 7 | 1 3039165 3039036 Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed . It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
8 | -------------------------------------------------------------------------------- /language_modeling/examples/tests_samples/MRPC/train.tsv: -------------------------------------------------------------------------------- 1 | Quality #1 ID #2 ID #1 String #2 String 2 | 1 1355540 1355592 He said the foodservice pie business doesn 't fit the company 's long-term growth strategy . " The foodservice pie business does not fit our long-term growth strategy . 3 | 0 2029631 2029565 Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war . His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war . 4 | 0 487993 487952 The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat . The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent . 5 | 1 1989515 1989458 The AFL-CIO is waiting until October to decide if it will endorse a candidate . The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries . 6 | 0 1783137 1782659 No dates have been set for the civil or the criminal trial . No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty . 7 | 1 3039165 3039036 Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed . It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status . 8 | -------------------------------------------------------------------------------- /dialogue_generation/pycocoevalcap/bleu/bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : bleu.py 4 | # 5 | # Description : Wrapper for BLEU scorer. 6 | # 7 | # Creation Date : 06-01-2015 8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT 9 | 10 | from .bleu_scorer import BleuScorer 11 | 12 | 13 | class Bleu: 14 | def __init__(self, n=4): 15 | # by default, compute BLEU score up to 4-grams 16 | self._n = n 17 | self._hypo_for_image = {} 18 | self.ref_for_image = {} 19 | 20 | def compute_score(self, gts, res): 21 | 22 | assert(gts.keys() == res.keys()) 23 | imgIds = gts.keys() 24 | 25 | bleu_scorer = BleuScorer(n=self._n) 26 | for id in imgIds: 27 | hypo = res[id] 28 | ref = gts[id] 29 | 30 | # Sanity check. 31 | assert(type(hypo) is list) 32 | assert(len(hypo) == 1) 33 | assert(type(ref) is list) 34 | #print(ref) 35 | #assert(len(ref) > 1) 36 | 37 | bleu_scorer += (hypo[0], ref) 38 | 39 | #score, scores = bleu_scorer.compute_score(option='shortest') 40 | score, scores = bleu_scorer.compute_score(option='closest', verbose=1) 41 | #score, scores = bleu_scorer.compute_score(option='average', verbose=1) 42 | 43 | # return (bleu, bleu_info) 44 | return score, scores 45 | 46 | def method(self): 47 | return "Bleu" 48 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/tokenization_bart.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from .tokenization_roberta import RobertaTokenizer 17 | 18 | 19 | # vocab and merges same as roberta 20 | vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json" 21 | merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt" 22 | _all_bart_models = [ 23 | "bart-large", 24 | "bart-large-mnli", 25 | # "bart-large-cnn" 26 | ] 27 | 28 | 29 | class BartTokenizer(RobertaTokenizer): 30 | # merges and vocab same as Roberta 31 | max_model_input_sizes = {m: 1024 for m in _all_bart_models} 32 | pretrained_vocab_files_map = { 33 | "vocab_file": {m: vocab_url for m in _all_bart_models}, 34 | "merges_file": {m: merges_url for m in _all_bart_models}, 35 | } 36 | -------------------------------------------------------------------------------- /dialogue_generation/test_special_tokens.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import shutil 3 | import unittest 4 | 5 | from pytorch_transformers import OpenAIGPTTokenizer, GPT2Tokenizer 6 | from train import ATTR_TO_SPECIAL_TOKEN, SPECIAL_TOKENS 7 | 8 | class TestSpecialTokenTreatment(unittest.TestCase): 9 | 10 | def setUp(self): 11 | self.save_dir = Path('utest_save_dir') 12 | self.save_dir.mkdir(exist_ok=True) 13 | 14 | def tearDown(self): 15 | shutil.rmtree(self.save_dir) 16 | 17 | def test_special_tokens_checkpoint_behavior(self): 18 | toks = [OpenAIGPTTokenizer.from_pretrained('openai-gpt'), GPT2Tokenizer.from_pretrained('gpt2')] 19 | for tok in toks: 20 | self.assertEqual(len(tok.added_tokens_encoder), 0) 21 | tok.add_special_tokens(ATTR_TO_SPECIAL_TOKEN) 22 | self.assertEqual(len(tok.added_tokens_encoder), 5) 23 | # Make sure we never split 24 | self.assertEqual(len(tok.tokenize(" ")), 2) 25 | ids = tok.convert_tokens_to_ids(SPECIAL_TOKENS) 26 | self.assertTrue(all([x > 0 for x in ids]), 27 | f'some tokens failed to tokenize {SPECIAL_TOKENS} -> {ids}') 28 | # Need to maintain indices through save. (this is also tested in pytorch-transformers) 29 | tok.save_pretrained(self.save_dir) 30 | tok_loaded = tok.from_pretrained(str(self.save_dir)) 31 | ids2 = tok_loaded.convert_tokens_to_ids(SPECIAL_TOKENS) 32 | self.assertListEqual(ids, ids2) 33 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/configuration_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ RoBERTa configuration """ 17 | 18 | from __future__ import (absolute_import, division, print_function, 19 | unicode_literals) 20 | 21 | import logging 22 | 23 | from .configuration_bert import BertConfig 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 28 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", 29 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", 30 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", 31 | } 32 | 33 | 34 | class RobertaConfig(BertConfig): 35 | pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 36 | -------------------------------------------------------------------------------- /dialogue_generation/pycocoevalcap/cider/cider.py: -------------------------------------------------------------------------------- 1 | # Filename: cider.py 2 | # 3 | # Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric 4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 5 | # 6 | # Creation Date: Sun Feb 8 14:16:54 2015 7 | # 8 | 9 | from .cider_scorer import CiderScorer 10 | import pdb 11 | 12 | class Cider: 13 | """ 14 | Main Class to compute the CIDEr metric 15 | 16 | """ 17 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 18 | # set cider to sum over 1 to 4-grams 19 | self._n = n 20 | # set the standard deviation parameter for gaussian penalty 21 | self._sigma = sigma 22 | 23 | def compute_score(self, gts, res): 24 | """ 25 | Main function to compute CIDEr score 26 | :param hypo_for_image (dict) : dictionary with key <image> and value <tokenized hypothesis / candidate sentence> 27 | ref_for_image (dict) : dictionary with key <image> and value <tokenized reference sentence> 28 | :return: cider (float) : computed CIDEr score for the corpus 29 | """ 30 | 31 | assert(gts.keys() == res.keys()) 32 | imgIds = gts.keys() 33 | 34 | cider_scorer = CiderScorer(n=self._n, sigma=self._sigma) 35 | 36 | for id in imgIds: 37 | hypo = res[id] 38 | ref = gts[id] 39 | 40 | # Sanity check. 41 | assert(type(hypo) is list) 42 | assert(len(hypo) == 1) 43 | assert(type(ref) is list) 44 | assert(len(ref) > 0) 45 | 46 | cider_scorer += (hypo[0], ref) 47 | 48 | (score, scores) = cider_scorer.compute_score() 49 | 50 | return score, scores 51 | 52 | def method(self): 53 | return "CIDEr" -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/tests/tokenization_dilbert_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | from io import open 20 | 21 | from pytorch_transformers.tokenization_distilbert import (DistilBertTokenizer) 22 | 23 | from .tokenization_tests_commons import CommonTestCases 24 | from .tokenization_bert_test import BertTokenizationTest 25 | 26 | class DistilBertTokenizationTest(BertTokenizationTest): 27 | 28 | tokenizer_class = DistilBertTokenizer 29 | 30 | def get_tokenizer(self, **kwargs): 31 | return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs) 32 | 33 | def test_sequence_builders(self): 34 | tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") 35 | 36 | text = tokenizer.encode("sequence builders") 37 | text_2 = tokenizer.encode("multi-sequence build") 38 | 39 | encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) 40 | encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) 41 | 42 | assert encoded_sentence == [101] + text + [102] 43 | assert encoded_pair == [101] + text + [102] + text_2 + [102] 44 | 45 | if __name__ == '__main__': 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/tests/tokenization_auto_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
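# Annotation (not part of the original file): this test sanity-checks AutoTokenizer
# dispatch. Loading the first checkpoint name from each archive map below should
# return the matching concrete class (BertTokenizer for BERT names, GPT2Tokenizer
# for GPT-2 names) with a non-empty vocabulary.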
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import shutil 21 | import pytest 22 | import logging 23 | 24 | from pytorch_transformers import AutoTokenizer, BertTokenizer, GPT2Tokenizer 25 | from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP 26 | from pytorch_transformers.modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_MAP 27 | 28 | 29 | class AutoTokenizerTest(unittest.TestCase): 30 | def test_tokenizer_from_pretrained(self): 31 | logging.basicConfig(level=logging.INFO) 32 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 33 | tokenizer = AutoTokenizer.from_pretrained(model_name) 34 | self.assertIsNotNone(tokenizer) 35 | self.assertIsInstance(tokenizer, BertTokenizer) 36 | self.assertGreater(len(tokenizer), 0) 37 | 38 | for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 39 | tokenizer = AutoTokenizer.from_pretrained(model_name) 40 | self.assertIsNotNone(tokenizer) 41 | self.assertIsInstance(tokenizer, GPT2Tokenizer) 42 | self.assertGreater(len(tokenizer), 0) 43 | 44 | 45 | if __name__ == "__main__": 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/tests/tokenization_utils_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 HuggingFace Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
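# Annotation (not part of the original file): shared sanity checks for pretrained
# tokenizers. Every loaded tokenizer must be an instance of both its concrete class
# and PreTrainedTokenizer, and each registered special token must be a string that
# converts to an integer id.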
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import six 21 | 22 | from pytorch_transformers import PreTrainedTokenizer 23 | from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer 24 | 25 | class TokenizerUtilsTest(unittest.TestCase): 26 | def check_tokenizer_from_pretrained(self, tokenizer_class): 27 | s3_models = list(tokenizer_class.max_model_input_sizes.keys()) 28 | for model_name in s3_models[:1]: 29 | tokenizer = tokenizer_class.from_pretrained(model_name) 30 | self.assertIsNotNone(tokenizer) 31 | self.assertIsInstance(tokenizer, tokenizer_class) 32 | self.assertIsInstance(tokenizer, PreTrainedTokenizer) 33 | 34 | for special_tok in tokenizer.all_special_tokens: 35 | if six.PY2: 36 | self.assertIsInstance(special_tok, unicode) 37 | else: 38 | self.assertIsInstance(special_tok, str) 39 | special_tok_id = tokenizer.convert_tokens_to_ids(special_tok) 40 | self.assertIsInstance(special_tok_id, int) 41 | 42 | def test_pretrained_tokenizers(self): 43 | self.check_tokenizer_from_pretrained(GPT2Tokenizer) 44 | 45 | if __name__ == "__main__": 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /language_modeling/.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | build_py3: 4 | working_directory: ~/pytorch-transformers 5 | docker: 6 | - image: circleci/python:3.5 7 | resource_class: xlarge 8 | parallelism: 1 9 | steps: 10 | - checkout 11 | - run: sudo pip install --progress-bar off . 12 | - run: sudo pip install pytest codecov pytest-cov 13 | - run: sudo pip install tensorboardX scikit-learn 14 | - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov 15 | - run: python -m pytest -sv ./examples/ 16 | - run: codecov 17 | build_py2: 18 | working_directory: ~/pytorch-transformers 19 | resource_class: large 20 | parallelism: 1 21 | docker: 22 | - image: circleci/python:2.7 23 | steps: 24 | - checkout 25 | - run: sudo pip install --progress-bar off . 26 | - run: sudo pip install pytest codecov pytest-cov 27 | - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov 28 | - run: codecov 29 | deploy_doc: 30 | working_directory: ~/pytorch-transformers 31 | docker: 32 | - image: circleci/python:3.5 33 | steps: 34 | - add_ssh_keys: 35 | fingerprints: 36 | - "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71" 37 | - checkout 38 | - run: sudo pip install --progress-bar off -r docs/requirements.txt 39 | - run: sudo pip install --progress-bar off -r requirements.txt 40 | - run: cd docs/source && ln -s ../../examples/README.md examples.md && cd - 41 | - run: cd docs && make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir 42 | workflow_filters: &workflow_filters 43 | filters: 44 | branches: 45 | only: 46 | - master 47 | workflows: 48 | version: 2 49 | build_and_test: 50 | jobs: 51 | - build_py3 52 | - build_py2 53 | - deploy_doc: *workflow_filters -------------------------------------------------------------------------------- /language_modeling/examples/distillation/scripts/token_counts.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ 16 | Preprocessing script before training DistilBERT. 17 | """ 18 | from collections import Counter 19 | import argparse 20 | import pickle 21 | import logging 22 | 23 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 24 | datefmt = '%m/%d/%Y %H:%M:%S', 25 | level = logging.INFO) 26 | logger = logging.getLogger(__name__) 27 | 28 | if __name__ == '__main__': 29 | parser = argparse.ArgumentParser(description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)") 30 | parser.add_argument("--data_file", type=str, default="data/dump.bert-base-uncased.pickle", 31 | help="The binarized dataset.") 32 | parser.add_argument("--token_counts_dump", type=str, default="data/token_counts.bert-base-uncased.pickle", 33 | help="The dump file.") 34 | parser.add_argument("--vocab_size", default=30522, type=int) 35 | args = parser.parse_args() 36 | 37 | logger.info(f'Loading data from {args.data_file}') 38 | with open(args.data_file, 'rb') as fp: 39 | data = pickle.load(fp) 40 | 41 | logger.info('Counting occurrences for MLM.') 42 | counter = Counter() 43 | for tk_ids in data: 44 | counter.update(tk_ids) 45 | counts = [0]*args.vocab_size 46 | for k, v in counter.items(): 47 | counts[k] = v 48 | 49 | logger.info(f'Dump to {args.token_counts_dump}') 50 | with open(args.token_counts_dump, 'wb') as handle: 51 | pickle.dump(counts, handle, protocol=pickle.HIGHEST_PROTOCOL) 52 | -------------------------------------------------------------------------------- /dialogue_generation/eval_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license.
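# Annotation (not part of the original file): metrics used by the dialogue-generation
# evaluation. f1 is a bag-of-words F1 between generated and reference token lists,
# scaled by 100; distinct computes corpus-level distinct-n for n in 1..4, where each
# sentence's score is the number of unique n-grams divided by the sentence length.
# Worked example for f1: f1([["a", "b"]], [["a", "c"]]) has precision 1/2 and
# recall 1/2, so it returns 100 * (2 * 0.5 * 0.5) / (0.5 + 0.5) = 50.0.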
3 | import torch 4 | import logging 5 | 6 | import numpy as np 7 | 8 | from pycocoevalcap.bleu.bleu import Bleu 9 | from collections import defaultdict 10 | 11 | from itertools import chain 12 | 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | def f1(generated, reference): 17 | 18 | f=[] 19 | 20 | for idx, g in enumerate(generated): 21 | count=0 22 | for x in g: 23 | if x in reference[idx]: 24 | count+=1 25 | if len(g)>0: 26 | precision = count/len(g) 27 | else: 28 | precision=0 29 | recall = count/len(reference[idx]) 30 | if recall!=0 and precision!=0: 31 | f.append(2*precision*recall/(precision+recall)) 32 | else: 33 | f.append(0) 34 | return np.array(f).mean()*100 35 | 36 | 37 | 38 | 39 | def pad_sequence(sequence, n, pad_left=False, pad_right=False, left_pad_symbol=None, right_pad_symbol=None): 40 | 41 | sequence = iter(sequence) 42 | if pad_left: 43 | sequence = chain((left_pad_symbol,) * (n - 1), sequence) 44 | if pad_right: 45 | sequence = chain(sequence, (right_pad_symbol,) * (n - 1)) 46 | return sequence 47 | 48 | 49 | def ngrams(sequence, n, pad_left=False, pad_right=False, left_pad_symbol=None, right_pad_symbol=None): 50 | 51 | sequence = pad_sequence(sequence, n, pad_left, pad_right, 52 | left_pad_symbol, right_pad_symbol) 53 | 54 | history = [] 55 | while n > 1: 56 | history.append(next(sequence)) 57 | n -= 1 58 | for item in sequence: 59 | history.append(item) 60 | yield tuple(history) 61 | del history[0] 62 | 63 | def distinct(generated): 64 | distinct_1 = distinct_n_corpus_level(generated, 1) 65 | distinct_2 = distinct_n_corpus_level(generated, 2) 66 | distinct_3 = distinct_n_corpus_level(generated, 3) 67 | distinct_4 = distinct_n_corpus_level(generated, 4) 68 | 69 | return distinct_1, distinct_2, distinct_3, distinct_4 70 | 71 | def distinct_n_sentence_level(sentence, n): 72 | 73 | if len(sentence) == 0: 74 | return 0.0 75 | distinct_ngrams = set(ngrams(sentence, n)) 76 | return len(distinct_ngrams) / len(sentence) 77 | 78 | 79 | def distinct_n_corpus_level(sentences, n): 80 | 81 | return sum(distinct_n_sentence_level(sentence, n) for sentence in sentences) / len(sentences) -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/tests/configuration_common_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 HuggingFace Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
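# Annotation (not part of the original file): ConfigTester is a reusable harness. It
# builds a config class from kwargs, checks the common attributes (vocab_size,
# hidden_size, num_attention_heads, num_hidden_layers), and round-trips the config
# through to_json_string and to_json_file, asserting the dicts come back equal.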
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import copy 20 | import os 21 | import shutil 22 | import json 23 | import random 24 | import uuid 25 | 26 | import unittest 27 | import logging 28 | 29 | 30 | class ConfigTester(object): 31 | def __init__(self, parent, config_class=None, **kwargs): 32 | self.parent = parent 33 | self.config_class = config_class 34 | self.inputs_dict = kwargs 35 | 36 | def create_and_test_config_common_properties(self): 37 | config = self.config_class(**self.inputs_dict) 38 | self.parent.assertTrue(hasattr(config, 'vocab_size')) 39 | self.parent.assertTrue(hasattr(config, 'hidden_size')) 40 | self.parent.assertTrue(hasattr(config, 'num_attention_heads')) 41 | self.parent.assertTrue(hasattr(config, 'num_hidden_layers')) 42 | 43 | def create_and_test_config_to_json_string(self): 44 | config = self.config_class(**self.inputs_dict) 45 | obj = json.loads(config.to_json_string()) 46 | for key, value in self.inputs_dict.items(): 47 | self.parent.assertEqual(obj[key], value) 48 | 49 | def create_and_test_config_to_json_file(self): 50 | config_first = self.config_class(**self.inputs_dict) 51 | json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json") 52 | config_first.to_json_file(json_file_path) 53 | config_second = self.config_class.from_json_file(json_file_path) 54 | os.remove(json_file_path) 55 | self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) 56 | 57 | def run_common_tests(self): 58 | self.create_and_test_config_common_properties() 59 | self.create_and_test_config_to_json_string() 60 | self.create_and_test_config_to_json_file() 61 | 62 | if __name__ == "__main__": 63 | unittest.main() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Sparse Text Generation 3 | 4 | This is the repository for our paper [Sparse Text Generation](https://arxiv.org/abs/2004.02644). 5 | 6 | # Installation 7 | Before running the code, you need to install the dependencies by running the following lines. 8 | ``` 9 | cd language_modeling 10 | pip3 install . 11 | ``` 12 | 13 | # Fine-tune GPT2 for Language Modeling 14 | 15 | ### Training 16 | To fine-tune GPT2 for language modelling you just need to run the following command, modifying the parameters as you wish. 17 | ``` 18 | python3 examples/run_lm_finetuning.py \ 19 | --train_data_file=/path/to/dataset/train \ 20 | --eval_data_file=/path/to/dataset/eval \ 21 | --output_dir=/path/to/output \ 22 | --model_type=gpt2 \ 23 | --model_name_or_path=gpt2-medium \ 24 | --block_size=512 \ 25 | --do_train \ 26 | --evaluate_during_training \ 27 | --loss=entmax \ 28 | --entmax_alpha=1.2 \ 29 | --top_k=0 \ 30 | --top_p=0 31 | ``` 32 | 33 | ### Evaluating 34 | To evaluate a model just run: 35 | ``` 36 | python3 examples/run_lm_finetuning.py \ 37 | --train_data_file=/path/to/dataset/train \ 38 | --eval_data_file=/path/to/dataset/eval \ 39 | --output_dir=/path/to/output \ 40 | --model_type=gpt2 \ 41 | --model_name_or_path=/path/to/checkpoint_to_evaluate \ 42 | --block_size=512 \ 43 | --do_eval \ 44 | --loss=entmax \ 45 | --entmax_alpha=1.2 \ 46 | --top_k=0 \ 47 | --top_p=0 48 | ``` 49 | 50 | # Fine-tune GPT2 for Dialogue Generation 51 | 52 | ### Training 53 | To fine-tune GPT2 for dialogue generation you just need to run the following command, modifying the parameters as you wish. 
54 | ``` 55 | python3 train.py \ 56 | --dataset_path=/path/to/dataset \ 57 | --model_checkpoint=gpt2-medium \ 58 | --name=name_you_want_to_give_to_model \ 59 | --loss=entmax \ 60 | --entmax_alpha=1.3 \ 61 | --top_p=0 \ 62 | --top_k=0 63 | ``` 64 | ### Evaluating 65 | To evaluate a model, just run: 66 | ``` 67 | python3 eval.py \ 68 | --dataset_path=/path/to/dataset \ 69 | --model_type=gpt2-medium \ 70 | --name=name_you_want_to_give_to_model \ 71 | --model_checkpoint=/path/to/checkpoint_to_evaluate \ 72 | --loss=entmax \ 73 | --entmax_alpha=1.3 \ 74 | --top_p=0 \ 75 | --top_k=0 76 | ``` 77 | 78 | # Acknowledgment 79 | A large portion of the code comes from the awesome Huggingface [Transformers](https://github.com/huggingface/transformers) library. 80 | 81 | # Citation 82 | 83 | @inproceedings{martins20sparse, 84 | author = {Martins, Pedro Henrique and Marinho, Zita and Martins, Andr{\'e} FT}, 85 | title = {Sparse Text Generation}, 86 | booktitle = {Proc. EMNLP}, 87 | year = {2020} 88 | } 89 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | LICENSE 2 | MANIFEST.in 3 | README.md 4 | setup.py 5 | pytorch_transformers/__init__.py 6 | pytorch_transformers/__main__.py 7 | pytorch_transformers/configuration_auto.py 8 | pytorch_transformers/configuration_bert.py 9 | pytorch_transformers/configuration_distilbert.py 10 | pytorch_transformers/configuration_gpt2.py 11 | pytorch_transformers/configuration_openai.py 12 | pytorch_transformers/configuration_roberta.py 13 | pytorch_transformers/configuration_transfo_xl.py 14 | pytorch_transformers/configuration_utils.py 15 | pytorch_transformers/configuration_xlm.py 16 | pytorch_transformers/configuration_xlnet.py 17 | pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py 18 | pytorch_transformers/convert_openai_checkpoint_to_pytorch.py 19 | pytorch_transformers/convert_pytorch_checkpoint_to_tf.py 20 | pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py 21 | pytorch_transformers/convert_tf_checkpoint_to_pytorch.py 22 | pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py 23 | pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py 24 | pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py 25 | pytorch_transformers/file_utils.py 26 | pytorch_transformers/modeling_auto.py 27 | pytorch_transformers/modeling_bert.py 28 | pytorch_transformers/modeling_distilbert.py 29 | pytorch_transformers/modeling_gpt2.py 30 | pytorch_transformers/modeling_openai.py 31 | pytorch_transformers/modeling_roberta.py 32 | pytorch_transformers/modeling_transfo_xl.py 33 | pytorch_transformers/modeling_transfo_xl_utilities.py 34 | pytorch_transformers/modeling_utils.py 35 | pytorch_transformers/modeling_xlm.py 36 | pytorch_transformers/modeling_xlnet.py 37 | pytorch_transformers/optimization.py 38 | pytorch_transformers/tokenization_albert.py 39 | pytorch_transformers/tokenization_auto.py 40 | pytorch_transformers/tokenization_bart.py 41 | pytorch_transformers/tokenization_bert.py 42 | pytorch_transformers/tokenization_bert_japanese.py 43 | pytorch_transformers/tokenization_camembert.py 44 | pytorch_transformers/tokenization_ctrl.py 45 | pytorch_transformers/tokenization_distilbert.py 46 | pytorch_transformers/tokenization_flaubert.py 47 | pytorch_transformers/tokenization_gpt2.py 48 | pytorch_transformers/tokenization_openai.py 49 | pytorch_transformers/tokenization_roberta.py 50
| pytorch_transformers/tokenization_t5.py 51 | pytorch_transformers/tokenization_transfo_xl.py 52 | pytorch_transformers/tokenization_utils.py 53 | pytorch_transformers/tokenization_xlm.py 54 | pytorch_transformers/tokenization_xlm_roberta.py 55 | pytorch_transformers/tokenization_xlnet.py 56 | pytorch_transformers.egg-info/PKG-INFO 57 | pytorch_transformers.egg-info/SOURCES.txt 58 | pytorch_transformers.egg-info/dependency_links.txt 59 | pytorch_transformers.egg-info/entry_points.txt 60 | pytorch_transformers.egg-info/requires.txt 61 | pytorch_transformers.egg-info/top_level.txt -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import argparse 22 | import torch 23 | 24 | from pytorch_transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert 25 | 26 | import logging 27 | logging.basicConfig(level=logging.INFO) 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = BertConfig.from_json_file(bert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = BertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_bert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | ## Required parameters 46 | parser.add_argument("--tf_checkpoint_path", 47 | default = None, 48 | type = str, 49 | required = True, 50 | help = "Path to the TensorFlow checkpoint path.") 51 | parser.add_argument("--bert_config_file", 52 | default = None, 53 | type = str, 54 | required = True, 55 | help = "The config json file corresponding to the pre-trained BERT model. 
\n" 56 | "This specifies the model architecture.") 57 | parser.add_argument("--pytorch_dump_path", 58 | default = None, 59 | type = str, 60 | required = True, 61 | help = "Path to the output PyTorch model.") 62 | args = parser.parse_args() 63 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, 64 | args.bert_config_file, 65 | args.pytorch_dump_path) 66 | -------------------------------------------------------------------------------- /dialogue_generation/pycocoevalcap/eval.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | from tokenizer.ptbtokenizer import PTBTokenizer 3 | from bleu.bleu import Bleu 4 | from meteor.meteor import Meteor 5 | from rouge.rouge import Rouge 6 | from cider.cider import Cider 7 | 8 | class COCOEvalCap: 9 | def __init__(self, coco, cocoRes): 10 | self.evalImgs = [] 11 | self.eval = {} 12 | self.imgToEval = {} 13 | self.coco = coco 14 | self.cocoRes = cocoRes 15 | self.params = {'image_id': coco.getImgIds()} 16 | 17 | def evaluate(self): 18 | imgIds = self.params['image_id'] 19 | # imgIds = self.coco.getImgIds() 20 | gts = {} 21 | res = {} 22 | for imgId in imgIds: 23 | gts[imgId] = self.coco.imgToAnns[imgId] 24 | res[imgId] = self.cocoRes.imgToAnns[imgId] 25 | 26 | # ================================================= 27 | # Set up scorers 28 | # ================================================= 29 | print 'tokenization...' 30 | tokenizer = PTBTokenizer() 31 | gts = tokenizer.tokenize(gts) 32 | res = tokenizer.tokenize(res) 33 | 34 | # ================================================= 35 | # Set up scorers 36 | # ================================================= 37 | print 'setting up scorers...' 38 | scorers = [ 39 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 40 | (Meteor(),"METEOR"), 41 | (Rouge(), "ROUGE_L"), 42 | (Cider(), "CIDEr") 43 | ] 44 | 45 | # ================================================= 46 | # Compute scores 47 | # ================================================= 48 | for scorer, method in scorers: 49 | print 'computing %s score...'%(scorer.method()) 50 | score, scores = scorer.compute_score(gts, res) 51 | if type(method) == list: 52 | for sc, scs, m in zip(score, scores, method): 53 | self.setEval(sc, m) 54 | self.setImgToEvalImgs(scs, gts.keys(), m) 55 | print "%s: %0.3f"%(m, sc) 56 | else: 57 | self.setEval(score, method) 58 | self.setImgToEvalImgs(scores, gts.keys(), method) 59 | print "%s: %0.3f"%(method, score) 60 | self.setEvalImgs() 61 | 62 | def setEval(self, score, method): 63 | self.eval[method] = score 64 | 65 | def setImgToEvalImgs(self, scores, imgIds, method): 66 | for imgId, score in zip(imgIds, scores): 67 | if not imgId in self.imgToEval: 68 | self.imgToEval[imgId] = {} 69 | self.imgToEval[imgId]["image_id"] = imgId 70 | self.imgToEval[imgId][method] = score 71 | 72 | def setEvalImgs(self): 73 | self.evalImgs = [eval for imgId, eval in self.imgToEval.items()] -------------------------------------------------------------------------------- /language_modeling/setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple check list from AllenNLP repo: https://github.com/allenai/allennlp/blob/master/setup.py 3 | 4 | To create the package for pypi. 5 | 6 | 1. Change the version in __init__.py and setup.py. 7 | 8 | 2. Commit these changes with the message: "Release: VERSION" 9 | 10 | 3. 
Add a tag in git to mark the release: "git tag VERSION -m'Adds tag VERSION for pypi' " 11 | Push the tag to git: git push --tags origin master 12 | 13 | 4. Build both the sources and the wheel. Do not change anything in setup.py between 14 | creating the wheel and the source distribution (obviously). 15 | 16 | For the wheel, run: "python setup.py bdist_wheel" in the top level allennlp directory. 17 | (this will build a wheel for the python version you use to build it - make sure you use python 3.x). 18 | 19 | For the sources, run: "python setup.py sdist" 20 | You should now have a /dist directory with both .whl and .tar.gz source versions of allennlp. 21 | 22 | 5. Check that everything looks correct by uploading the package to the pypi test server: 23 | 24 | twine upload dist/* -r pypitest 25 | (pypi suggest using twine as other methods upload files via plaintext.) 26 | 27 | Check that you can install it in a virtualenv by running: 28 | pip install -i https://testpypi.python.org/pypi pytorch-transformers 29 | 30 | 6. Upload the final version to actual pypi: 31 | twine upload dist/* -r pypi 32 | 33 | 7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory. 34 | 35 | """ 36 | from io import open 37 | from setuptools import find_packages, setup 38 | 39 | setup( 40 | name="pytorch_transformers", 41 | version="1.2.0", 42 | description="Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM", 43 | 44 | 45 | keywords='NLP deep learning transformer pytorch BERT GPT GPT-2 google openai CMU', 46 | license='Apache', 47 | url="https://github.com/huggingface/pytorch-transformers", 48 | packages=find_packages(exclude=["*.tests", "*.tests.*", 49 | "tests.*", "tests"]), 50 | install_requires=['torch>=1.0.0', 51 | 'numpy', 52 | 'boto3', 53 | 'requests', 54 | 'tqdm', 55 | 'regex', 56 | 'sentencepiece', 57 | 'sacremoses'], 58 | entry_points={ 59 | 'console_scripts': [ 60 | "pytorch_transformers=pytorch_transformers.__main__:main", 61 | ] 62 | }, 63 | # python_requires='>=3.5.0', 64 | tests_require=['pytest'], 65 | classifiers=[ 66 | 'Intended Audience :: Science/Research', 67 | 'License :: OSI Approved :: Apache Software License', 68 | 'Programming Language :: Python :: 3', 69 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 70 | ], 71 | ) 72 | -------------------------------------------------------------------------------- /dialogue_generation/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-present, HuggingFace Inc. 2 | # All rights reserved. This source code is licensed under the BSD-style license found in the 3 | # LICENSE file in the root directory of this source tree. 
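# Annotation (not part of the original file): helpers shared by the
# dialogue-generation scripts. download_pretrained_model fetches and extracts the
# finetuned PersonaChat model from S3; get_dataset downloads and tokenizes the
# PersonaChat dataset, caching it under a name suffixed with the tokenizer class so
# GPT and GPT-2 ids are never mixed; make_logdir builds a unique runs/ directory
# from the timestamp, hostname, and model name.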
4 | from datetime import datetime 5 | import json 6 | import logging 7 | import os 8 | import tarfile 9 | import tempfile 10 | import socket 11 | 12 | import torch 13 | 14 | from pytorch_transformers import cached_path 15 | 16 | PERSONACHAT_URL = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json" 17 | HF_FINETUNED_MODEL = "https://s3.amazonaws.com/models.huggingface.co/transfer-learning-chatbot/gpt_personachat_cache.tar.gz" 18 | 19 | logger = logging.getLogger(__file__) 20 | 21 | def download_pretrained_model(): 22 | """ Download and extract finetuned model from S3 """ 23 | resolved_archive_file = cached_path(HF_FINETUNED_MODEL) 24 | tempdir = tempfile.mkdtemp() 25 | #logger.info("extracting archive file {} to temp dir {}".format(resolved_archive_file, tempdir)) 26 | with tarfile.open(resolved_archive_file, 'r:gz') as archive: 27 | archive.extractall(tempdir) 28 | return tempdir 29 | 30 | 31 | def get_dataset(tokenizer, dataset_path, dataset_cache): 32 | """ Get tokenized PERSONACHAT dataset from S3 or cache.""" 33 | dataset_path = dataset_path or PERSONACHAT_URL 34 | dataset_cache = dataset_cache + '_' + type(tokenizer).__name__ # To avoid using GPT cache for GPT-2 and vice-versa 35 | if dataset_cache and os.path.isfile(dataset_cache): 36 | logger.info("Load tokenized dataset from cache at %s", dataset_cache) 37 | dataset = torch.load(dataset_cache) 38 | else: 39 | logger.info("Download dataset from %s", dataset_path) 40 | personachat_file = cached_path(dataset_path) 41 | with open(personachat_file, "r", encoding="utf-8") as f: 42 | dataset = json.loads(f.read()) 43 | 44 | logger.info("Tokenize and encode the dataset") 45 | def tokenize(obj): 46 | if isinstance(obj, str): 47 | return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj)) 48 | if isinstance(obj, dict): 49 | return dict((n, tokenize(o)) for n, o in obj.items()) 50 | return list(tokenize(o) for o in obj) 51 | dataset = tokenize(dataset) 52 | torch.save(dataset, dataset_cache) 53 | return dataset 54 | 55 | 56 | class AttrDict(dict): 57 | def __init__(self, *args, **kwargs): 58 | super(AttrDict, self).__init__(*args, **kwargs) 59 | self.__dict__ = self 60 | 61 | 62 | def make_logdir(model_name: str): 63 | """Create unique path to save results and checkpoints, e.g. runs/Sep22_19-45-59_gpu-7_gpt2""" 64 | # Code copied from ignite repo 65 | current_time = datetime.now().strftime('%b%d_%H-%M-%S') 66 | logdir = os.path.join( 67 | 'runs', current_time + '_' + socket.gethostname() + '_' + model_name) 68 | return logdir 69 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/tests/tokenization_openai_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
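# A minimal usage sketch for the helpers in dialogue_generation/utils.py above
# (kept as a comment since it needs a live tokenizer; the cache path is
# illustrative and not part of the original code):
#
#     from pytorch_transformers import OpenAIGPTTokenizer
#     tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
#     dataset = get_dataset(tokenizer, dataset_path=None,        # falls back to PERSONACHAT_URL
#                           dataset_cache="./dataset_cache")
#     logdir = make_logdir("openai-gpt")                         # e.g. runs/Sep22_19-45-59_host_openai-gpt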
15 | from __future__ import absolute_import, division, print_function, unicode_literals
16 | 
17 | import os
18 | import unittest
19 | import json
20 | 
21 | from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES
22 | 
23 | from .tokenization_tests_commons import CommonTestCases
24 | 
25 | 
26 | class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester):
27 | 
28 |     tokenizer_class = OpenAIGPTTokenizer
29 | 
30 |     def setUp(self):
31 |         super(OpenAIGPTTokenizationTest, self).setUp()
32 | 
33 |         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
34 |         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
35 |                  "w</w>", "r</w>", "t</w>",
36 |                  "lo", "low", "er</w>",
37 |                  "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>"]
38 |         vocab_tokens = dict(zip(vocab, range(len(vocab))))
39 |         merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
40 | 
41 |         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
42 |         self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
43 |         with open(self.vocab_file, "w") as fp:
44 |             fp.write(json.dumps(vocab_tokens))
45 |         with open(self.merges_file, "w") as fp:
46 |             fp.write("\n".join(merges))
47 | 
48 |     def get_tokenizer(self, **kwargs):
49 |         return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname, **kwargs)
50 | 
51 |     def get_input_output_texts(self):
52 |         input_text = u"lower newer"
53 |         output_text = u"lower newer"
54 |         return input_text, output_text
55 | 
56 | 
57 |     def test_full_tokenizer(self):
58 |         tokenizer = OpenAIGPTTokenizer(self.vocab_file, self.merges_file)
59 | 
60 |         text = "lower"
61 |         bpe_tokens = ["low", "er</w>"]
62 |         tokens = tokenizer.tokenize(text)
63 |         self.assertListEqual(tokens, bpe_tokens)
64 | 
65 |         input_tokens = tokens + ["<unk>"]
66 |         input_bpe_tokens = [14, 15, 20]
67 |         self.assertListEqual(
68 |             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
69 | 
70 | 
71 | if __name__ == '__main__':
72 |     unittest.main()
73 |
--------------------------------------------------------------------------------
/dialogue_generation/pycocoevalcap/tokenizer/ptbtokenizer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # File Name : ptbtokenizer.py
4 | #
5 | # Description : Do the PTB Tokenization and remove punctuations.
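# Worked example for the toy BPE files in the test above (a sketch of the
# standard BPE procedure, not additional test code): "lower" is first split
# into characters with an end-of-word marker, "l o w e r</w>"; the merge rules
# "l o" -> "lo", "lo w" -> "low" and "e r</w>" -> "er</w>" are then applied in
# order, yielding the tokens ["low", "er</w>"], which map to ids [14, 15].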
6 | #
7 | # Creation Date : 29-12-2014
8 | # Last Modified : Thu Mar 19 09:53:35 2015
9 | 
10 | import os
11 | import sys
12 | import subprocess
13 | import tempfile
14 | import itertools
15 | 
16 | # path to the stanford corenlp jar
17 | STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar'
18 | 
19 | # punctuations to be removed from the sentences
20 | PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", \
21 |                 ".", "?", "!", ",", ":", "-", "--", "...", ";"]
22 | 
23 | class PTBTokenizer:
24 |     """Python wrapper of Stanford PTBTokenizer"""
25 | 
26 |     def tokenize(self, captions_for_image):
27 |         cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR, \
28 |                 'edu.stanford.nlp.process.PTBTokenizer', \
29 |                 '-preserveLines', '-lowerCase']
30 | 
31 |         # ======================================================
32 |         # prepare data for PTB Tokenizer
33 |         # ======================================================
34 |         final_tokenized_captions_for_image = {}
35 |         image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))]
36 |         sentences = '\n'.join([c['caption'].replace('\n', ' ') for k, v in captions_for_image.items() for c in v])
37 | 
38 |         # ======================================================
39 |         # save sentences to temporary file
40 |         # ======================================================
41 |         path_to_jar_dirname = os.path.dirname(os.path.abspath(__file__))
42 |         tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname)
43 |         tmp_file.write(sentences.encode('utf-8'))
44 |         tmp_file.close()
45 | 
46 |         # ======================================================
47 |         # tokenize sentence
48 |         # ======================================================
49 |         cmd.append(os.path.basename(tmp_file.name))
50 |         p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname, \
51 |                 stdout=subprocess.PIPE)
52 |         token_lines = p_tokenizer.communicate()[0].decode('utf-8')
53 |         lines = token_lines.split('\n')
54 |         # remove temp file
55 |         os.remove(tmp_file.name)
56 | 
57 |         # ======================================================
58 |         # create dictionary for tokenized captions
59 |         # ======================================================
60 |         for k, line in zip(image_id, lines):
61 |             if k not in final_tokenized_captions_for_image:
62 |                 final_tokenized_captions_for_image[k] = []
63 |             tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \
64 |                     if w not in PUNCTUATIONS])
65 |             final_tokenized_captions_for_image[k].append(tokenized_caption)
66 | 
67 |         return final_tokenized_captions_for_image
68 |
--------------------------------------------------------------------------------
/language_modeling/pytorch_transformers/tests/tokenization_transfo_xl_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
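# A hypothetical invocation of the PTBTokenizer wrapper above (it requires
# stanford-corenlp-3.4.1.jar next to ptbtokenizer.py; the id and caption are
# made up for illustration):
#
#     tokenizer = PTBTokenizer()
#     captions = {'1': [{'caption': 'A man, riding a horse!'}]}
#     print(tokenizer.tokenize(captions))   # {'1': ['a man riding a horse']}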
15 | from __future__ import absolute_import, division, print_function, unicode_literals
16 | 
17 | import os
18 | import unittest
19 | from io import open
20 | 
21 | from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES
22 | 
23 | from .tokenization_tests_commons import CommonTestCases
24 | 
25 | class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):
26 | 
27 |     tokenizer_class = TransfoXLTokenizer
28 | 
29 |     def setUp(self):
30 |         super(TransfoXLTokenizationTest, self).setUp()
31 | 
32 |         vocab_tokens = [
33 |             "<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un",
34 |             "running", ",", "low", "l",
35 |         ]
36 |         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
37 |         with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
38 |             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
39 | 
40 |     def get_tokenizer(self, **kwargs):
41 |         kwargs['lower_case'] = True
42 |         return TransfoXLTokenizer.from_pretrained(self.tmpdirname, **kwargs)
43 | 
44 |     def get_input_output_texts(self):
45 |         input_text = u"<unk> UNwanted , running"
46 |         output_text = u"<unk> unwanted, running"
47 |         return input_text, output_text
48 | 
49 |     def test_full_tokenizer(self):
50 |         tokenizer = TransfoXLTokenizer(vocab_file=self.vocab_file, lower_case=True)
51 | 
52 |         tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
53 |         self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
54 | 
55 |         self.assertListEqual(
56 |             tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
57 | 
58 |     def test_full_tokenizer_lower(self):
59 |         tokenizer = TransfoXLTokenizer(lower_case=True)
60 | 
61 |         self.assertListEqual(
62 |             tokenizer.tokenize(u" \tHeLLo ! how  \n Are yoU ?  "),
63 |             ["hello", "!", "how", "are", "you", "?"])
64 | 
65 |     def test_full_tokenizer_no_lower(self):
66 |         tokenizer = TransfoXLTokenizer(lower_case=False)
67 | 
68 |         self.assertListEqual(
69 |             tokenizer.tokenize(u" \tHeLLo ! how  \n Are yoU ?  "),
70 |             ["HeLLo", "!", "how", "Are", "yoU", "?"])
71 | 
72 | 
73 | if __name__ == '__main__':
74 |     unittest.main()
75 |
--------------------------------------------------------------------------------
/language_modeling/examples/distillation/scripts/binarized_data.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2019-present, the HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """
16 | Preprocessing script before training DistilBERT.
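Example invocation (the paths shown are just the argparse defaults below):

    python scripts/binarized_data.py --file_path data/dump.txt \
        --bert_tokenizer bert-base-uncased --dump_file data/dump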
17 | """ 18 | import argparse 19 | import pickle 20 | import random 21 | import time 22 | import numpy as np 23 | from pytorch_transformers import BertTokenizer 24 | import logging 25 | 26 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 27 | datefmt = '%m/%d/%Y %H:%M:%S', 28 | level = logging.INFO) 29 | logger = logging.getLogger(__name__) 30 | 31 | def main(): 32 | parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).") 33 | parser.add_argument('--file_path', type=str, default='data/dump.txt', 34 | help='The path to the data.') 35 | parser.add_argument('--bert_tokenizer', type=str, default='bert-base-uncased', 36 | help="The tokenizer to use.") 37 | parser.add_argument('--dump_file', type=str, default='data/dump', 38 | help='The dump file prefix.') 39 | args = parser.parse_args() 40 | 41 | 42 | logger.info(f'Loading Tokenizer ({args.bert_tokenizer})') 43 | bert_tokenizer = BertTokenizer.from_pretrained(args.bert_tokenizer) 44 | 45 | 46 | logger.info(f'Loading text from {args.file_path}') 47 | with open(args.file_path, 'r', encoding='utf8') as fp: 48 | data = fp.readlines() 49 | 50 | 51 | logger.info(f'Start encoding') 52 | logger.info(f'{len(data)} examples to process.') 53 | 54 | rslt = [] 55 | iter = 0 56 | interval = 10000 57 | start = time.time() 58 | for text in data: 59 | text = f'[CLS] {text.strip()} [SEP]' 60 | token_ids = bert_tokenizer.encode(text) 61 | rslt.append(token_ids) 62 | 63 | iter += 1 64 | if iter % interval == 0: 65 | end = time.time() 66 | logger.info(f'{iter} examples processed. - {(end-start)/interval:.2f}s/expl') 67 | start = time.time() 68 | logger.info('Finished binarization') 69 | logger.info(f'{len(data)} examples processed.') 70 | 71 | 72 | dp_file = f'{args.dump_file}.{args.bert_tokenizer}.pickle' 73 | rslt_ = [np.uint16(d) for d in rslt] 74 | random.shuffle(rslt_) 75 | logger.info(f'Dump to {dp_file}') 76 | with open(dp_file, 'wb') as handle: 77 | pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL) 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /dialogue_generation/pycocoevalcap/meteor/meteor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | import os 5 | import sys 6 | import subprocess 7 | import threading 8 | 9 | # Assumes meteor-1.5.jar is in the same directory as meteor.py. Change as needed. 
10 | METEOR_JAR = 'meteor-1.5.jar'
11 | # print(METEOR_JAR)
12 | 
13 | class Meteor:
14 | 
15 |     def __init__(self):
16 |         self.meteor_cmd = ['java', '-jar', '-Xmx2G', METEOR_JAR, \
17 |                 '-', '-', '-stdio', '-l', 'en', '-norm']
18 |         self.meteor_p = subprocess.Popen(self.meteor_cmd, \
19 |                 cwd=os.path.dirname(os.path.abspath(__file__)), \
20 |                 stdin=subprocess.PIPE, \
21 |                 stdout=subprocess.PIPE, \
22 |                 stderr=subprocess.PIPE)
23 |         # Used to guarantee thread safety
24 |         self.lock = threading.Lock()
25 | 
26 |     def compute_score(self, gts, res):
27 |         assert(gts.keys() == res.keys())
28 |         imgIds = gts.keys()
29 |         scores = []
30 | 
31 |         eval_line = 'EVAL'
32 |         self.lock.acquire()
33 |         for i in imgIds:
34 |             assert(len(res[i]) == 1)
35 |             stat = self._stat(res[i][0], gts[i])
36 |             eval_line += ' ||| {}'.format(stat)
37 | 
38 |         self.meteor_p.stdin.write('{}\n'.format(eval_line).encode('utf-8')); self.meteor_p.stdin.flush()
39 |         for i in range(0, len(imgIds)):
40 |             scores.append(float(self.meteor_p.stdout.readline().decode('utf-8').strip()))
41 |         score = float(self.meteor_p.stdout.readline().decode('utf-8').strip())
42 |         self.lock.release()
43 | 
44 |         return score, scores
45 | 
46 |     def method(self):
47 |         return "METEOR"
48 | 
49 |     def _stat(self, hypothesis_str, reference_list):
50 |         # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words
51 |         hypothesis_str = hypothesis_str.replace('|||', '').replace('  ', ' ')
52 |         score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str))
53 |         self.meteor_p.stdin.write('{}\n'.format(score_line).encode('utf-8')); self.meteor_p.stdin.flush()
54 |         return self.meteor_p.stdout.readline().decode('utf-8').strip()
55 | 
56 |     def _score(self, hypothesis_str, reference_list):
57 |         self.lock.acquire()
58 |         # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words
59 |         hypothesis_str = hypothesis_str.replace('|||', '').replace('  ', ' ')
60 |         score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str))
61 |         self.meteor_p.stdin.write('{}\n'.format(score_line).encode('utf-8')); self.meteor_p.stdin.flush()
62 |         stats = self.meteor_p.stdout.readline().decode('utf-8').strip()
63 |         eval_line = 'EVAL ||| {}'.format(stats)
64 |         # EVAL ||| stats
65 |         self.meteor_p.stdin.write('{}\n'.format(eval_line).encode('utf-8')); self.meteor_p.stdin.flush()
66 |         score = float(self.meteor_p.stdout.readline().decode('utf-8').strip())
67 |         # bug fix: the jar returns two values (an average and an overall score), so read twice
68 |         # thanks to Andrej for pointing this out
69 |         score = float(self.meteor_p.stdout.readline().decode('utf-8').strip())
70 |         self.lock.release()
71 |         return score
72 | 
73 |     def __exit__(self):
74 |         self.lock.acquire()
75 |         self.meteor_p.stdin.close()
76 |         self.meteor_p.kill()
77 |         self.meteor_p.wait()
78 |         self.lock.release()
79 |
--------------------------------------------------------------------------------
/language_modeling/pytorch_transformers/tests/tokenization_gpt2_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
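# A minimal usage sketch for the Meteor wrapper above (kept as a comment:
# it assumes meteor-1.5.jar is present, and the example sentences are made up):
#
#     scorer = Meteor()
#     gts = {'0': ['a cat sits on the mat']}          # references
#     res = {'0': ['a cat is sitting on the mat']}    # one candidate per image
#     score, scores = scorer.compute_score(gts, res)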
15 | from __future__ import absolute_import, division, print_function, unicode_literals
16 | 
17 | import os
18 | import unittest
19 | import json
20 | from io import open
21 | 
22 | from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES
23 | 
24 | from .tokenization_tests_commons import CommonTestCases
25 | 
26 | class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
27 | 
28 |     tokenizer_class = GPT2Tokenizer
29 | 
30 |     def setUp(self):
31 |         super(GPT2TokenizationTest, self).setUp()
32 | 
33 |         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
34 |         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
35 |                  "\u0120", "\u0120l", "\u0120n",
36 |                  "\u0120lo", "\u0120low", "er",
37 |                  "\u0120lowest", "\u0120newer", "\u0120wider", "<unk>"]
38 |         vocab_tokens = dict(zip(vocab, range(len(vocab))))
39 |         merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
40 |         self.special_tokens_map = {"unk_token": "<unk>"}
41 | 
42 |         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
43 |         self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
44 |         with open(self.vocab_file, "w", encoding="utf-8") as fp:
45 |             fp.write(json.dumps(vocab_tokens) + "\n")
46 |         with open(self.merges_file, "w", encoding="utf-8") as fp:
47 |             fp.write("\n".join(merges))
48 | 
49 |     def get_tokenizer(self, **kwargs):
50 |         kwargs.update(self.special_tokens_map)
51 |         return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
52 | 
53 |     def get_input_output_texts(self):
54 |         input_text = u"lower newer"
55 |         output_text = u" lower newer"
56 |         return input_text, output_text
57 | 
58 |     def test_full_tokenizer(self):
59 |         tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
60 |         text = "lower newer"
61 |         bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
62 |         tokens = tokenizer.tokenize(text)
63 |         self.assertListEqual(tokens, bpe_tokens)
64 | 
65 |         input_tokens = tokens + [tokenizer.unk_token]
66 |         input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
67 |         self.assertListEqual(
68 |             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
69 | 
70 | 
71 | if __name__ == '__main__':
72 |     unittest.main()
73 |
--------------------------------------------------------------------------------
/language_modeling/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
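# Note on the "\u0120" entries in the GPT-2 test vocab above: GPT-2's
# byte-level BPE remaps the space byte to the printable character "\u0120"
# ("Ġ"), so an entry like "\u0120low" is the word piece " low" with its
# leading space folded into the token itself.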
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import json 21 | from io import open 22 | 23 | import torch 24 | import numpy 25 | 26 | from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME 27 | from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES 28 | 29 | import logging 30 | logging.basicConfig(level=logging.INFO) 31 | 32 | def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path): 33 | # Load checkpoint 34 | chkpt = torch.load(xlm_checkpoint_path, map_location='cpu') 35 | 36 | model = chkpt['model'] 37 | 38 | config = chkpt['params'] 39 | config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))) 40 | 41 | vocab = chkpt['dico_word2id'] 42 | vocab = dict((s + '' if s.find('@@') == -1 and i > 13 else s.replace('@@', ''), i) for s, i in vocab.items()) 43 | 44 | # Save pytorch-model 45 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 46 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 47 | pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['vocab_file'] 48 | 49 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 50 | torch.save(model, pytorch_weights_dump_path) 51 | 52 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 53 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 54 | f.write(json.dumps(config, indent=2) + "\n") 55 | 56 | print("Save vocab file to {}".format(pytorch_config_dump_path)) 57 | with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f: 58 | f.write(json.dumps(vocab, indent=2) + "\n") 59 | 60 | 61 | if __name__ == "__main__": 62 | parser = argparse.ArgumentParser() 63 | ## Required parameters 64 | parser.add_argument("--xlm_checkpoint_path", 65 | default = None, 66 | type = str, 67 | required = True, 68 | help = "Path the official PyTorch dump.") 69 | parser.add_argument("--pytorch_dump_folder_path", 70 | default = None, 71 | type = str, 72 | required = True, 73 | help = "Path to the output PyTorch model.") 74 | args = parser.parse_args() 75 | convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path) 76 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, 25 | GPT2Config, 26 | GPT2Model, 27 | load_tf_weights_in_gpt2) 28 | 29 | import logging 30 | logging.basicConfig(level=logging.INFO) 31 | 32 | 33 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 34 | # Construct model 35 | if gpt2_config_file == "": 36 | config = GPT2Config() 37 | else: 38 | config = GPT2Config.from_json_file(gpt2_config_file) 39 | model = GPT2Model(config) 40 | 41 | # Load weights from numpy 42 | load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) 43 | 44 | # Save pytorch-model 45 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 46 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 47 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 48 | torch.save(model.state_dict(), pytorch_weights_dump_path) 49 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 50 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 51 | f.write(config.to_json_string()) 52 | 53 | 54 | if __name__ == "__main__": 55 | parser = argparse.ArgumentParser() 56 | ## Required parameters 57 | parser.add_argument("--gpt2_checkpoint_path", 58 | default = None, 59 | type = str, 60 | required = True, 61 | help = "Path to the TensorFlow checkpoint path.") 62 | parser.add_argument("--pytorch_dump_folder_path", 63 | default = None, 64 | type = str, 65 | required = True, 66 | help = "Path to the output PyTorch model.") 67 | parser.add_argument("--gpt2_config_file", 68 | default = "", 69 | type = str, 70 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 71 | "This specifies the model architecture.") 72 | args = parser.parse_args() 73 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, 74 | args.gpt2_config_file, 75 | args.pytorch_dump_folder_path) 76 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, 25 | OpenAIGPTConfig, 26 | OpenAIGPTModel, 27 | load_tf_weights_in_openai_gpt) 28 | 29 | import logging 30 | logging.basicConfig(level=logging.INFO) 31 | 32 | 33 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 34 | # Construct model 35 | if openai_config_file == "": 36 | config = OpenAIGPTConfig() 37 | else: 38 | config = OpenAIGPTConfig.from_json_file(openai_config_file) 39 | model = OpenAIGPTModel(config) 40 | 41 | # Load weights from numpy 42 | load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path) 43 | 44 | # Save pytorch-model 45 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 46 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 47 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 48 | torch.save(model.state_dict(), pytorch_weights_dump_path) 49 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 50 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 51 | f.write(config.to_json_string()) 52 | 53 | 54 | if __name__ == "__main__": 55 | parser = argparse.ArgumentParser() 56 | ## Required parameters 57 | parser.add_argument("--openai_checkpoint_folder_path", 58 | default = None, 59 | type = str, 60 | required = True, 61 | help = "Path to the TensorFlow checkpoint path.") 62 | parser.add_argument("--pytorch_dump_folder_path", 63 | default = None, 64 | type = str, 65 | required = True, 66 | help = "Path to the output PyTorch model.") 67 | parser.add_argument("--openai_config_file", 68 | default = "", 69 | type = str, 70 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 71 | "This specifies the model architecture.") 72 | args = parser.parse_args() 73 | convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path, 74 | args.openai_config_file, 75 | args.pytorch_dump_folder_path) 76 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/tests/tokenization_xlm_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from __future__ import absolute_import, division, print_function, unicode_literals
16 | 
17 | import os
18 | import unittest
19 | import json
20 | 
21 | from pytorch_transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES
22 | 
23 | from .tokenization_tests_commons import CommonTestCases
24 | 
25 | class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
26 | 
27 |     tokenizer_class = XLMTokenizer
28 | 
29 |     def setUp(self):
30 |         super(XLMTokenizationTest, self).setUp()
31 | 
32 |         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
33 |         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
34 |                  "w</w>", "r</w>", "t</w>",
35 |                  "lo", "low", "er</w>",
36 |                  "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>"]
37 |         vocab_tokens = dict(zip(vocab, range(len(vocab))))
38 |         merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
39 | 
40 |         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
41 |         self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
42 |         with open(self.vocab_file, "w") as fp:
43 |             fp.write(json.dumps(vocab_tokens))
44 |         with open(self.merges_file, "w") as fp:
45 |             fp.write("\n".join(merges))
46 | 
47 |     def get_tokenizer(self, **kwargs):
48 |         return XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs)
49 | 
50 |     def get_input_output_texts(self):
51 |         input_text = u"lower newer"
52 |         output_text = u"lower newer"
53 |         return input_text, output_text
54 | 
55 |     def test_full_tokenizer(self):
56 |         """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
57 |         tokenizer = XLMTokenizer(self.vocab_file, self.merges_file)
58 | 
59 |         text = "lower"
60 |         bpe_tokens = ["low", "er</w>"]
61 |         tokens = tokenizer.tokenize(text)
62 |         self.assertListEqual(tokens, bpe_tokens)
63 | 
64 |         input_tokens = tokens + ["<unk>"]
65 |         input_bpe_tokens = [14, 15, 20]
66 |         self.assertListEqual(
67 |             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
68 | 
69 |     def test_sequence_builders(self):
70 |         tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
71 | 
72 |         text = tokenizer.encode("sequence builders")
73 |         text_2 = tokenizer.encode("multi-sequence build")
74 | 
75 |         encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
76 |         encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)
77 | 
78 |         assert encoded_sentence == [1] + text + [1]
79 |         assert encoded_pair == [1] + text + [1] + text_2 + [1]
80 | 
81 | if __name__ == '__main__':
82 |     unittest.main()
83 |
--------------------------------------------------------------------------------
/language_modeling/pytorch_transformers/configuration_distilbert.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
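# As asserted in test_sequence_builders above, XLM brackets inputs with its
# boundary token (id 1 in the pretrained vocabulary):
#     single sentence:  [1] + ids_a + [1]
#     sentence pair:    [1] + ids_a + [1] + ids_b + [1]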
15 | """ DistilBERT model configuration """ 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import sys 20 | import json 21 | import logging 22 | from io import open 23 | 24 | from .configuration_utils import PretrainedConfig 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 29 | 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", 30 | 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json" 31 | } 32 | 33 | 34 | class DistilBertConfig(PretrainedConfig): 35 | pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 36 | 37 | def __init__(self, 38 | vocab_size_or_config_json_file=30522, 39 | max_position_embeddings=512, 40 | sinusoidal_pos_embds=True, 41 | n_layers=6, 42 | n_heads=12, 43 | dim=768, 44 | hidden_dim=4*768, 45 | dropout=0.1, 46 | attention_dropout=0.1, 47 | activation='gelu', 48 | initializer_range=0.02, 49 | tie_weights_=True, 50 | qa_dropout=0.1, 51 | seq_classif_dropout=0.2, 52 | **kwargs): 53 | super(DistilBertConfig, self).__init__(**kwargs) 54 | 55 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 56 | and isinstance(vocab_size_or_config_json_file, unicode)): 57 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 58 | json_config = json.loads(reader.read()) 59 | for key, value in json_config.items(): 60 | self.__dict__[key] = value 61 | elif isinstance(vocab_size_or_config_json_file, int): 62 | self.vocab_size = vocab_size_or_config_json_file 63 | self.max_position_embeddings = max_position_embeddings 64 | self.sinusoidal_pos_embds = sinusoidal_pos_embds 65 | self.n_layers = n_layers 66 | self.n_heads = n_heads 67 | self.dim = dim 68 | self.hidden_dim = hidden_dim 69 | self.dropout = dropout 70 | self.attention_dropout = attention_dropout 71 | self.activation = activation 72 | self.initializer_range = initializer_range 73 | self.tie_weights_ = tie_weights_ 74 | self.qa_dropout = qa_dropout 75 | self.seq_classif_dropout = seq_classif_dropout 76 | else: 77 | raise ValueError("First argument must be either a vocabulary size (int)" 78 | " or the path to a pretrained model config file (str)") 79 | @property 80 | def hidden_size(self): 81 | return self.dim 82 | 83 | @property 84 | def num_attention_heads(self): 85 | return self.n_heads 86 | 87 | @property 88 | def num_hidden_layers(self): 89 | return self.n_layers 90 | -------------------------------------------------------------------------------- /dialogue_generation/pycocoevalcap/rouge/rouge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : rouge.py 4 | # 5 | # 6 | # Creation Date : 2015-01-07 06:03 7 | 8 | import numpy as np 9 | import pdb 10 | 11 | def my_lcs(string, sub): 12 | """ 13 | Calculates longest common subsequence for a pair of tokenized strings 14 | :param string : list of str : tokens from a string split using whitespace 15 | :param sub : list of str : shorter string, also split using whitespace 16 | :returns: length (list of int): length of the longest common subsequence between the two strings 17 | 18 | Note: my_lcs only gives length of the longest common subsequence, not the actual LCS 19 | """ 20 | if(len(string)< len(sub)): 21 | sub, string = string, sub 22 | 23 | lengths = [[0 for i in 
24 | 
25 |     for j in range(1,len(sub)+1):
26 |         for i in range(1,len(string)+1):
27 |             if string[i-1] == sub[j-1]:
28 |                 lengths[i][j] = lengths[i-1][j-1] + 1
29 |             else:
30 |                 lengths[i][j] = max(lengths[i-1][j], lengths[i][j-1])
31 | 
32 |     return lengths[len(string)][len(sub)]
33 | 
34 | class Rouge:
35 |     '''
36 |     Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set
37 | 
38 |     '''
39 |     def __init__(self):
40 |         # vrama91: updated the value below based on discussion with Hovey
41 |         self.beta = 1.2
42 | 
43 |     def calc_score(self, candidate, refs):
44 |         """
45 |         Compute ROUGE-L score given one candidate and references for an image
46 |         :param candidate: str : candidate sentence to be evaluated
47 |         :param refs: list of str : COCO reference sentences for the particular image to be evaluated
48 |         :returns score: float (ROUGE-L score for the candidate evaluated against references)
49 |         """
50 |         assert(len(candidate) == 1)
51 |         assert(len(refs) > 0)
52 |         prec = []
53 |         rec = []
54 | 
55 |         # split into tokens
56 |         token_c = candidate[0].split(" ")
57 | 
58 |         for reference in refs:
59 |             # split into tokens
60 |             token_r = reference.split(" ")
61 |             # compute the longest common subsequence
62 |             lcs = my_lcs(token_r, token_c)
63 |             prec.append(lcs/float(len(token_c)))
64 |             rec.append(lcs/float(len(token_r)))
65 | 
66 |         prec_max = max(prec)
67 |         rec_max = max(rec)
68 | 
69 |         if prec_max != 0 and rec_max != 0:
70 |             score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max)
71 |         else:
72 |             score = 0.0
73 |         return score
74 | 
75 |     def compute_score(self, gts, res):
76 |         """
77 |         Computes ROUGE-L score given a set of reference and candidate sentences for the dataset
78 |         Invoked by evaluate_captions.py
79 |         :param gts: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values
80 |         :param res: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values
81 |         :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images)
82 |         """
83 |         assert(gts.keys() == res.keys())
84 |         imgIds = gts.keys()
85 | 
86 |         score = []
87 |         for img_id in imgIds:
88 |             hypo = res[img_id]
89 |             ref = gts[img_id]
90 | 
91 |             score.append(self.calc_score(hypo, ref))
92 | 
93 |             # Sanity check.
94 |             assert(type(hypo) is list)
95 |             assert(len(hypo) == 1)
96 |             assert(type(ref) is list)
97 |             assert(len(ref) > 0)
98 | 
99 |         average_score = np.mean(np.array(score))
100 |         return average_score, np.array(score)
101 | 
102 |     def method(self):
103 |         return "Rouge"
104 |
--------------------------------------------------------------------------------
/language_modeling/pytorch_transformers/tokenization_distilbert.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
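# Worked example of the F-measure used in Rouge.calc_score above, with
# beta = 1.2 and made-up precision/recall values P = 0.5, R = 0.4:
#     F = ((1 + 1.2**2) * 0.5 * 0.4) / (0.4 + 1.2**2 * 0.5)
#       = 0.488 / 1.12
#       ≈ 0.436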
15 | """Tokenization classes for DistilBERT.""" 16 | 17 | 18 | import logging 19 | 20 | from .tokenization_bert import BertTokenizer, BertTokenizerFast 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 26 | 27 | PRETRAINED_VOCAB_FILES_MAP = { 28 | "vocab_file": { 29 | "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 30 | "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 31 | "distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", 32 | "distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", 33 | "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt", 34 | "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", 35 | } 36 | } 37 | 38 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 39 | "distilbert-base-uncased": 512, 40 | "distilbert-base-uncased-distilled-squad": 512, 41 | "distilbert-base-cased": 512, 42 | "distilbert-base-cased-distilled-squad": 512, 43 | "distilbert-base-german-cased": 512, 44 | "distilbert-base-multilingual-cased": 512, 45 | } 46 | 47 | 48 | PRETRAINED_INIT_CONFIGURATION = { 49 | "distilbert-base-uncased": {"do_lower_case": True}, 50 | "distilbert-base-uncased-distilled-squad": {"do_lower_case": True}, 51 | "distilbert-base-cased": {"do_lower_case": False}, 52 | "distilbert-base-cased-distilled-squad": {"do_lower_case": False}, 53 | "distilbert-base-german-cased": {"do_lower_case": False}, 54 | "distilbert-base-multilingual-cased": {"do_lower_case": False}, 55 | } 56 | 57 | 58 | class DistilBertTokenizer(BertTokenizer): 59 | r""" 60 | Constructs a DistilBertTokenizer. 61 | :class:`~transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece 62 | 63 | Args: 64 | vocab_file: Path to a one-wordpiece-per-line vocabulary file 65 | do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True 66 | do_basic_tokenize: Whether to do basic tokenization before wordpiece. 67 | max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the 68 | minimum of this value (if specified) and the underlying BERT model's sequence length. 69 | never_split: List of tokens which will never be split during tokenization. 
Only has an effect when
70 |             do_basic_tokenize=True
71 |     """
72 | 
73 |     vocab_files_names = VOCAB_FILES_NAMES
74 |     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
75 |     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
76 |     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
77 | 
78 | 
79 | class DistilBertTokenizerFast(BertTokenizerFast):
80 |     vocab_files_names = VOCAB_FILES_NAMES
81 |     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
82 |     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
83 |     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
84 |
--------------------------------------------------------------------------------
/language_modeling/examples/distillation/scripts/extract_for_distil.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2019-present, the HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """
16 | Preprocessing script before training DistilBERT.
17 | """
18 | from pytorch_transformers import BertForPreTraining
19 | import torch
20 | import argparse
21 | 
22 | if __name__ == '__main__':
23 |     parser = argparse.ArgumentParser(description="Extract selected layers of the full BertForPreTraining for transfer-learned distillation")
24 |     parser.add_argument("--bert_model", default='bert-base-uncased', type=str)
25 |     parser.add_argument("--dump_checkpoint", default='serialization_dir/transfer_learning_checkpoint_0247911.pth', type=str)
26 |     parser.add_argument("--vocab_transform", action='store_true')
27 |     args = parser.parse_args()
28 | 
29 | 
30 |     model = BertForPreTraining.from_pretrained(args.bert_model)
31 | 
32 |     state_dict = model.state_dict()
33 |     compressed_sd = {}
34 | 
35 |     for w in ['word_embeddings', 'position_embeddings']:
36 |         compressed_sd[f'distilbert.embeddings.{w}.weight'] = \
37 |             state_dict[f'bert.embeddings.{w}.weight']
38 |     for w in ['weight', 'bias']:
39 |         compressed_sd[f'distilbert.embeddings.LayerNorm.{w}'] = \
40 |             state_dict[f'bert.embeddings.LayerNorm.{w}']
41 | 
42 |     std_idx = 0
43 |     for teacher_idx in [0, 2, 4, 7, 9, 11]:
44 |         for w in ['weight', 'bias']:
45 |             compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}'] = \
46 |                 state_dict[f'bert.encoder.layer.{teacher_idx}.attention.self.query.{w}']
47 |             compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.k_lin.{w}'] = \
48 |                 state_dict[f'bert.encoder.layer.{teacher_idx}.attention.self.key.{w}']
49 |             compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.v_lin.{w}'] = \
50 |                 state_dict[f'bert.encoder.layer.{teacher_idx}.attention.self.value.{w}']
51 | 
52 |             compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.out_lin.{w}'] = \
53 |                 state_dict[f'bert.encoder.layer.{teacher_idx}.attention.output.dense.{w}']
54 |             compressed_sd[f'distilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}'] = \
55 |                 state_dict[f'bert.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}']
56 | 
57 |             compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin1.{w}'] = \
58 |                 state_dict[f'bert.encoder.layer.{teacher_idx}.intermediate.dense.{w}']
59 |             compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin2.{w}'] = \
60 |                 state_dict[f'bert.encoder.layer.{teacher_idx}.output.dense.{w}']
61 |             compressed_sd[f'distilbert.transformer.layer.{std_idx}.output_layer_norm.{w}'] = \
62 |                 state_dict[f'bert.encoder.layer.{teacher_idx}.output.LayerNorm.{w}']
63 |         std_idx += 1
64 | 
65 |     compressed_sd['vocab_projector.weight'] = state_dict['cls.predictions.decoder.weight']
66 |     compressed_sd['vocab_projector.bias'] = state_dict['cls.predictions.bias']
67 |     if args.vocab_transform:
68 |         for w in ['weight', 'bias']:
69 |             compressed_sd[f'vocab_transform.{w}'] = state_dict[f'cls.predictions.transform.dense.{w}']
70 |             compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'cls.predictions.transform.LayerNorm.{w}']
71 | 
72 |     print(f'N layers selected for distillation: {std_idx}')
73 |     print(f'Number of params transferred for distillation: {len(compressed_sd.keys())}')
74 | 
75 |     print(f'Save transferred checkpoint to {args.dump_checkpoint}.')
76 |     torch.save(compressed_sd, args.dump_checkpoint)
77 |
--------------------------------------------------------------------------------
/language_modeling/pytorch_transformers/tests/modeling_auto_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
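# The loop in extract_for_distil.py above realizes this teacher-to-student
# layer mapping (a restatement of the hard-coded list, not new behavior):
#
#     teacher_to_student = dict(zip([0, 2, 4, 7, 9, 11], range(6)))
#     assert teacher_to_student[7] == 3   # BERT layer 7 -> DistilBERT layer 3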
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import shutil 21 | import pytest 22 | import logging 23 | 24 | from pytorch_transformers import (AutoConfig, BertConfig, 25 | AutoModel, BertModel, 26 | AutoModelWithLMHead, BertForMaskedLM, 27 | AutoModelForSequenceClassification, BertForSequenceClassification, 28 | AutoModelForQuestionAnswering, BertForQuestionAnswering) 29 | from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP 30 | 31 | from .modeling_common_test import (CommonTestCases, ids_tensor) 32 | from .configuration_common_test import ConfigTester 33 | 34 | 35 | class AutoModelTest(unittest.TestCase): 36 | def test_model_from_pretrained(self): 37 | logging.basicConfig(level=logging.INFO) 38 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 39 | config = AutoConfig.from_pretrained(model_name) 40 | self.assertIsNotNone(config) 41 | self.assertIsInstance(config, BertConfig) 42 | 43 | model = AutoModel.from_pretrained(model_name) 44 | model, loading_info = AutoModel.from_pretrained(model_name, output_loading_info=True) 45 | self.assertIsNotNone(model) 46 | self.assertIsInstance(model, BertModel) 47 | for value in loading_info.values(): 48 | self.assertEqual(len(value), 0) 49 | 50 | def test_lmhead_model_from_pretrained(self): 51 | logging.basicConfig(level=logging.INFO) 52 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 53 | config = AutoConfig.from_pretrained(model_name) 54 | self.assertIsNotNone(config) 55 | self.assertIsInstance(config, BertConfig) 56 | 57 | model = AutoModelWithLMHead.from_pretrained(model_name) 58 | model, loading_info = AutoModelWithLMHead.from_pretrained(model_name, output_loading_info=True) 59 | self.assertIsNotNone(model) 60 | self.assertIsInstance(model, BertForMaskedLM) 61 | 62 | def test_sequence_classification_model_from_pretrained(self): 63 | logging.basicConfig(level=logging.INFO) 64 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 65 | config = AutoConfig.from_pretrained(model_name) 66 | self.assertIsNotNone(config) 67 | self.assertIsInstance(config, BertConfig) 68 | 69 | model = AutoModelForSequenceClassification.from_pretrained(model_name) 70 | model, loading_info = AutoModelForSequenceClassification.from_pretrained(model_name, output_loading_info=True) 71 | self.assertIsNotNone(model) 72 | self.assertIsInstance(model, BertForSequenceClassification) 73 | 74 | def test_question_answering_model_from_pretrained(self): 75 | logging.basicConfig(level=logging.INFO) 76 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 77 | config = AutoConfig.from_pretrained(model_name) 78 | self.assertIsNotNone(config) 79 | self.assertIsInstance(config, BertConfig) 80 | 81 | model = AutoModelForQuestionAnswering.from_pretrained(model_name) 82 | model, loading_info = AutoModelForQuestionAnswering.from_pretrained(model_name, output_loading_info=True) 83 | self.assertIsNotNone(model) 84 | self.assertIsInstance(model, BertForQuestionAnswering) 85 | 86 | 87 | if __name__ == "__main__": 88 | unittest.main() 89 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/tests/fixtures/sample_text.txt: -------------------------------------------------------------------------------- 1 | This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত 2 | Text should be 
one-sentence-per-line, with empty lines between documents. 3 | This sample text is public domain and was randomly selected from Project Guttenberg. 4 | 5 | The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors. 6 | Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity. 7 | Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them. 8 | "Cass" Beard had risen early that morning, but not with a view to discovery. 9 | A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets. 10 | The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency. 11 | This was nearly opposite. 12 | Mr. Cassius crossed the highway, and stopped suddenly. 13 | Something glittered in the nearest red pool before him. 14 | Gold, surely! 15 | But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring. 16 | Looking at it more attentively, he saw that it bore the inscription, "May to Cass." 17 | Like most of his fellow gold-seekers, Cass was superstitious. 18 | 19 | The fountain of classic wisdom, Hypatia herself. 20 | As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge. 21 | From my youth I felt in me a soul above the matter-entangled herd. 22 | She revealed to me the glorious fact, that I am a spark of Divinity itself. 23 | A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's. 24 | There is a philosophic pleasure in opening one's treasures to the modest young. 25 | Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street. 26 | Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide; 27 | but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind. 
28 | Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now. 29 | His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert; 30 | while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts. 31 | At last they reached the quay at the opposite end of the street; 32 | and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers. 33 | He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him. 34 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/tests/tokenization_roberta_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import json 19 | import unittest 20 | from io import open 21 | 22 | from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES 23 | from .tokenization_tests_commons import CommonTestCases 24 | 25 | 26 | class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): 27 | tokenizer_class = RobertaTokenizer 28 | 29 | def setUp(self): 30 | super(RobertaTokenizationTest, self).setUp() 31 | 32 | # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt 33 | vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", 34 | "\u0120", "\u0120l", "\u0120n", 35 | "\u0120lo", "\u0120low", "er", 36 | "\u0120lowest", "\u0120newer", "\u0120wider", "<unk>"] 37 | vocab_tokens = dict(zip(vocab, range(len(vocab)))) 38 | merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] 39 | self.special_tokens_map = {"unk_token": "<unk>"} 40 | 41 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) 42 | self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) 43 | with open(self.vocab_file, "w", encoding="utf-8") as fp: 44 | fp.write(json.dumps(vocab_tokens) + "\n") 45 | with open(self.merges_file, "w", encoding="utf-8") as fp: 46 | fp.write("\n".join(merges)) 47 | 48 | def get_tokenizer(self, **kwargs): 49 | kwargs.update(self.special_tokens_map) 50 | return RobertaTokenizer.from_pretrained(self.tmpdirname, **kwargs) 51 | 52 | def get_input_output_texts(self): 53 | input_text = u"lower newer" 54 | output_text = u" lower newer" 55 | return input_text, output_text 56 | 57 | def test_full_tokenizer(self): 58 | tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) 59 | text = "lower newer" 60 | bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"] 61 | tokens = tokenizer.tokenize(text) 62 | self.assertListEqual(tokens, bpe_tokens) 63 | 64 | input_tokens = tokens + [tokenizer.unk_token] 65 | input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] 66 | self.assertListEqual( 67 | tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) 68 | 69 | def roberta_dict_integration_testing(self): 70 | tokenizer = self.get_tokenizer() 71 | 72 | self.assertListEqual( 73 | tokenizer.encode('Hello world!'), 74 | [0, 31414, 232, 328, 2] 75 | ) 76 | self.assertListEqual( 77 | tokenizer.encode('Hello world! cécé herlolip 418'), 78 | [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2] 79 | ) 80 | 81 | def test_sequence_builders(self): 82 | tokenizer = RobertaTokenizer.from_pretrained("roberta-base") 83 | 84 | text = tokenizer.encode("sequence builders") 85 | text_2 = tokenizer.encode("multi-sequence build") 86 | 87 | encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True) 88 | encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True) 89 | 90 | encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) 91 | encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) 92 | 93 | assert encoded_sentence == encoded_text_from_decode 94 | assert encoded_pair == encoded_pair_from_decode 95 | 96 | 97 | if __name__ == '__main__': 98 | unittest.main() 99 | -------------------------------------------------------------------------------- /language_modeling/examples/test_examples.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 HuggingFace Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import sys 20 | import unittest 21 | import argparse 22 | import logging 23 | 24 | try: 25 | # python 3.4+ can use builtin unittest.mock instead of mock package 26 | from unittest.mock import patch 27 | except ImportError: 28 | from mock import patch 29 | 30 | import run_glue 31 | import run_squad 32 | import run_generation 33 | 34 | logging.basicConfig(level=logging.DEBUG) 35 | 36 | logger = logging.getLogger() 37 | 38 | def get_setup_file(): 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument('-f') 41 | args = parser.parse_args() 42 | return args.f 43 | 44 | class ExamplesTests(unittest.TestCase): 45 | 46 | def test_run_glue(self): 47 | stream_handler = logging.StreamHandler(sys.stdout) 48 | logger.addHandler(stream_handler) 49 | 50 | testargs = ["run_glue.py", 51 | "--data_dir=./examples/tests_samples/MRPC/", 52 | "--task_name=mrpc", 53 | "--do_train", 54 | "--do_eval", 55 | "--output_dir=./examples/tests_samples/temp_dir", 56 | "--per_gpu_train_batch_size=2", 57 | "--per_gpu_eval_batch_size=1", 58 | "--learning_rate=1e-4", 59 | "--max_steps=10", 60 | "--warmup_steps=2", 61 | "--overwrite_output_dir", 62 | "--seed=42"] 63 | model_type, model_name = ("--model_type=bert", 64 | "--model_name_or_path=bert-base-uncased") 65 | with patch.object(sys, 'argv', testargs + [model_type, model_name]): 66 | result = run_glue.main() 67 | for value in result.values(): 68 | self.assertGreaterEqual(value, 0.75) 69 | 70 | def test_run_squad(self): 71 | stream_handler = logging.StreamHandler(sys.stdout) 72 | logger.addHandler(stream_handler) 73 | 74 | testargs = ["run_squad.py", 75 | "--train_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json", 76 | "--predict_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json", 77 | "--model_name=bert-base-uncased", 78 | "--output_dir=./examples/tests_samples/temp_dir", 79 | "--max_steps=10", 80 | "--warmup_steps=2", 81 | "--do_train", 82 | "--do_eval", 83 | "--version_2_with_negative", 84 | "--learning_rate=2e-4", 85 | "--per_gpu_train_batch_size=2", 86 | "--per_gpu_eval_batch_size=1", 87 | "--overwrite_output_dir", 88 | "--seed=42"] 89 | model_type, model_name = ("--model_type=bert", 90 | "--model_name_or_path=bert-base-uncased") 91 | with patch.object(sys, 'argv', testargs + [model_type, model_name]): 92 | result = run_squad.main() 93 | self.assertGreaterEqual(result['f1'], 30) 94 | self.assertGreaterEqual(result['exact'], 30) 95 | 96 | def test_generation(self): 97 | stream_handler = logging.StreamHandler(sys.stdout) 98 | logger.addHandler(stream_handler) 99 | 100 | testargs = ["run_generation.py", 101 | "--prompt=Hello", 102 | "--length=10", 103 | "--seed=42"] 104 | model_type, model_name = ("--model_type=openai-gpt", 105 | "--model_name_or_path=openai-gpt") 106 | with patch.object(sys, 'argv', testargs + [model_type, model_name]): 107 | result = run_generation.main() 108 | self.assertGreaterEqual(len(result), 10) 109 | 110 | if __name__ == "__main__": 111 | unittest.main() 112 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.2.0" 2 | # Work around to update TensorFlow's absl.logging threshold which alters the 3 | # 
default Python logging output behavior when present. 4 | # see: https://github.com/abseil/abseil-py/issues/99 5 | # and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493 6 | try: 7 | import absl.logging 8 | absl.logging.set_verbosity('info') 9 | absl.logging.set_stderrthreshold('info') 10 | absl.logging._warn_preinit_stderr = False 11 | except: 12 | pass 13 | 14 | # Tokenizer 15 | from .tokenization_utils import (PreTrainedTokenizer) 16 | from .tokenization_auto import AutoTokenizer 17 | from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer 18 | from .tokenization_openai import OpenAIGPTTokenizer 19 | from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) 20 | from .tokenization_gpt2 import GPT2Tokenizer 21 | from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE 22 | from .tokenization_xlm import XLMTokenizer 23 | from .tokenization_roberta import RobertaTokenizer 24 | from .tokenization_distilbert import DistilBertTokenizer 25 | 26 | # Configurations 27 | from .configuration_utils import PretrainedConfig 28 | from .configuration_auto import AutoConfig 29 | from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP 30 | from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP 31 | from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP 32 | from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP 33 | from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP 34 | from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP 35 | from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 36 | from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 37 | 38 | # Modeling 39 | from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D) 40 | from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, 41 | AutoModelWithLMHead) 42 | 43 | from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining, 44 | BertForMaskedLM, BertForNextSentencePrediction, 45 | BertForSequenceClassification, BertForMultipleChoice, 46 | BertForTokenClassification, BertForQuestionAnswering, 47 | load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP) 48 | from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel, 49 | OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, 50 | load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) 51 | from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel, 52 | load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) 53 | from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model, 54 | GPT2LMHeadModel, GPT2DoubleHeadsModel, 55 | load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) 56 | from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel, 57 | XLNetForSequenceClassification, XLNetForQuestionAnswering, 58 | load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) 59 | from .modeling_xlm import (XLMPreTrainedModel , XLMModel, 60 | XLMWithLMHeadModel, XLMForSequenceClassification, 61 | XLMForQuestionAnswering, XLM_PRETRAINED_MODEL_ARCHIVE_MAP) 62 | from .modeling_roberta import (RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification, 63 | ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) 64 | 
from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel, 65 | DistilBertForSequenceClassification, DistilBertForQuestionAnswering, 66 | DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) 67 | 68 | # Optimization 69 | from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule, 70 | WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule) 71 | 72 | # Files and general utilities 73 | from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, 74 | cached_path, add_start_docstrings, add_end_docstrings, 75 | WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME) 76 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import argparse 23 | import torch 24 | 25 | from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, 26 | XLNetConfig, 27 | XLNetLMHeadModel, XLNetForQuestionAnswering, 28 | XLNetForSequenceClassification, 29 | load_tf_weights_in_xlnet) 30 | 31 | GLUE_TASKS_NUM_LABELS = { 32 | "cola": 2, 33 | "mnli": 3, 34 | "mrpc": 2, 35 | "sst-2": 2, 36 | "sts-b": 1, 37 | "qqp": 2, 38 | "qnli": 2, 39 | "rte": 2, 40 | "wnli": 2, 41 | } 42 | 43 | import logging 44 | logging.basicConfig(level=logging.INFO) 45 | 46 | def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None): 47 | # Initialise PyTorch model 48 | config = XLNetConfig.from_json_file(bert_config_file) 49 | 50 | finetuning_task = finetuning_task.lower() if finetuning_task is not None else "" 51 | if finetuning_task in GLUE_TASKS_NUM_LABELS: 52 | print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config))) 53 | config.finetuning_task = finetuning_task 54 | config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task] 55 | model = XLNetForSequenceClassification(config) 56 | elif 'squad' in finetuning_task: 57 | config.finetuning_task = finetuning_task 58 | model = XLNetForQuestionAnswering(config) 59 | else: 60 | model = XLNetLMHeadModel(config) 61 | 62 | # Load weights from tf checkpoint 63 | load_tf_weights_in_xlnet(model, config, tf_checkpoint_path) 64 | 65 | # Save pytorch-model 66 | pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) 67 | pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) 68 | print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) 69 | torch.save(model.state_dict(), pytorch_weights_dump_path) 70 | print("Save configuration file to 
{}".format(os.path.abspath(pytorch_config_dump_path))) 71 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 72 | f.write(config.to_json_string()) 73 | 74 | 75 | if __name__ == "__main__": 76 | parser = argparse.ArgumentParser() 77 | ## Required parameters 78 | parser.add_argument("--tf_checkpoint_path", 79 | default = None, 80 | type = str, 81 | required = True, 82 | help = "Path to the TensorFlow checkpoint.") 83 | parser.add_argument("--xlnet_config_file", 84 | default = None, 85 | type = str, 86 | required = True, 87 | help = "The config json file corresponding to the pre-trained XLNet model. \n" 88 | "This specifies the model architecture.") 89 | parser.add_argument("--pytorch_dump_folder_path", 90 | default = None, 91 | type = str, 92 | required = True, 93 | help = "Path to the folder to store the PyTorch model or dataset/vocab.") 94 | parser.add_argument("--finetuning_task", 95 | default = None, 96 | type = str, 97 | help = "Name of a task on which the XLNet TensorFlow model was fine-tuned") 98 | args = parser.parse_args() 99 | print(args) 100 | 101 | convert_xlnet_checkpoint_to_pytorch(args.tf_checkpoint_path, 102 | args.xlnet_config_file, 103 | args.pytorch_dump_folder_path, 104 | args.finetuning_task) 105 | -------------------------------------------------------------------------------- /language_modeling/examples/distillation/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team and Facebook, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Utils to train DistilBERT 16 | adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) 17 | """ 18 | import git 19 | import json 20 | import os 21 | import socket 22 | import torch 23 | import numpy as np 24 | 25 | import logging 26 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d - %(message)s', 27 | datefmt = '%m/%d/%Y %H:%M:%S', 28 | level = logging.INFO) 29 | logger = logging.getLogger(__name__) 30 | 31 | 32 | def git_log(folder_path: str): 33 | """ 34 | Log commit info. 35 | """ 36 | repo = git.Repo(search_parent_directories=True) 37 | repo_infos = { 38 | 'repo_id': str(repo), 39 | 'repo_sha': str(repo.head.object.hexsha), 40 | 'repo_branch': str(repo.active_branch) 41 | } 42 | 43 | with open(os.path.join(folder_path, 'git_log.json'), 'w') as f: 44 | json.dump(repo_infos, f, indent=4) 45 | 46 | 47 | def init_gpu_params(params): 48 | """ 49 | Handle single and multi-GPU / multi-node.
50 | """ 51 | if params.n_gpu <= 0: 52 | params.local_rank = 0 53 | params.master_port = -1 54 | params.is_master = True 55 | params.multi_gpu = False 56 | return 57 | 58 | assert torch.cuda.is_available() 59 | 60 | logger.info('Initializing GPUs') 61 | if params.n_gpu > 1: 62 | assert params.local_rank != -1 63 | 64 | params.world_size = int(os.environ['WORLD_SIZE']) 65 | params.n_gpu_per_node = int(os.environ['N_GPU_NODE']) 66 | params.global_rank = int(os.environ['RANK']) 67 | 68 | # number of nodes / node ID 69 | params.n_nodes = params.world_size // params.n_gpu_per_node 70 | params.node_id = params.global_rank // params.n_gpu_per_node 71 | params.multi_gpu = True 72 | 73 | assert params.n_nodes == int(os.environ['N_NODES']) 74 | assert params.node_id == int(os.environ['NODE_RANK']) 75 | 76 | # local job (single GPU) 77 | else: 78 | assert params.local_rank == -1 79 | 80 | params.n_nodes = 1 81 | params.node_id = 0 82 | params.local_rank = 0 83 | params.global_rank = 0 84 | params.world_size = 1 85 | params.n_gpu_per_node = 1 86 | params.multi_gpu = False 87 | 88 | # sanity checks 89 | assert params.n_nodes >= 1 90 | assert 0 <= params.node_id < params.n_nodes 91 | assert 0 <= params.local_rank <= params.global_rank < params.world_size 92 | assert params.world_size == params.n_nodes * params.n_gpu_per_node 93 | 94 | # define whether this is the master process / if we are in multi-node distributed mode 95 | params.is_master = params.node_id == 0 and params.local_rank == 0 96 | params.multi_node = params.n_nodes > 1 97 | 98 | # summary 99 | PREFIX = f"--- Global rank: {params.global_rank} - " 100 | logger.info(PREFIX + "Number of nodes: %i" % params.n_nodes) 101 | logger.info(PREFIX + "Node ID : %i" % params.node_id) 102 | logger.info(PREFIX + "Local rank : %i" % params.local_rank) 103 | logger.info(PREFIX + "World size : %i" % params.world_size) 104 | logger.info(PREFIX + "GPUs per node : %i" % params.n_gpu_per_node) 105 | logger.info(PREFIX + "Master : %s" % str(params.is_master)) 106 | logger.info(PREFIX + "Multi-node : %s" % str(params.multi_node)) 107 | logger.info(PREFIX + "Multi-GPU : %s" % str(params.multi_gpu)) 108 | logger.info(PREFIX + "Hostname : %s" % socket.gethostname()) 109 | 110 | # set GPU device 111 | torch.cuda.set_device(params.local_rank) 112 | 113 | # initialize multi-GPU 114 | if params.multi_gpu: 115 | logger.info("Initializing PyTorch distributed") 116 | torch.distributed.init_process_group( 117 | init_method='env://', 118 | backend='nccl', 119 | ) 120 | 121 | 122 | def set_seed(args): 123 | """ 124 | Set the random seed. 125 | """ 126 | np.random.seed(args.seed) 127 | torch.manual_seed(args.seed) 128 | if args.n_gpu > 0: 129 | torch.cuda.manual_seed_all(args.seed) 130 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" 17 | 18 | import os 19 | import argparse 20 | import torch 21 | import numpy as np 22 | import tensorflow as tf 23 | from pytorch_transformers import BertModel 24 | 25 | 26 | def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str): 27 | 28 | """ 29 | :param model:BertModel Pytorch model instance to be converted 30 | :param ckpt_dir: Tensorflow model directory 31 | :param model_name: model name 32 | :return: 33 | 34 | Currently supported HF models: 35 | Y BertModel 36 | N BertForMaskedLM 37 | N BertForPreTraining 38 | N BertForMultipleChoice 39 | N BertForNextSentencePrediction 40 | N BertForSequenceClassification 41 | N BertForQuestionAnswering 42 | """ 43 | 44 | tensors_to_transpose = ( 45 | "dense.weight", 46 | "attention.self.query", 47 | "attention.self.key", 48 | "attention.self.value" 49 | ) 50 | 51 | var_map = ( 52 | ('layer.', 'layer_'), 53 | ('word_embeddings.weight', 'word_embeddings'), 54 | ('position_embeddings.weight', 'position_embeddings'), 55 | ('token_type_embeddings.weight', 'token_type_embeddings'), 56 | ('.', '/'), 57 | ('LayerNorm/weight', 'LayerNorm/gamma'), 58 | ('LayerNorm/bias', 'LayerNorm/beta'), 59 | ('weight', 'kernel') 60 | ) 61 | 62 | if not os.path.isdir(ckpt_dir): 63 | os.makedirs(ckpt_dir) 64 | 65 | state_dict = model.state_dict() 66 | 67 | def to_tf_var_name(name:str): 68 | for patt, repl in iter(var_map): 69 | name = name.replace(patt, repl) 70 | return 'bert/{}'.format(name) 71 | 72 | def create_tf_var(tensor:np.ndarray, name:str, session:tf.Session): 73 | tf_dtype = tf.dtypes.as_dtype(tensor.dtype) 74 | tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) 75 | session.run(tf.variables_initializer([tf_var])) 76 | session.run(tf_var) 77 | return tf_var 78 | 79 | tf.reset_default_graph() 80 | with tf.Session() as session: 81 | for var_name in state_dict: 82 | tf_name = to_tf_var_name(var_name) 83 | torch_tensor = state_dict[var_name].numpy() 84 | if any([x in var_name for x in tensors_to_transpose]): 85 | torch_tensor = torch_tensor.T 86 | tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) 87 | tf.keras.backend.set_value(tf_var, torch_tensor) 88 | tf_weight = session.run(tf_var) 89 | print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor))) 90 | 91 | saver = tf.train.Saver(tf.trainable_variables()) 92 | saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) 93 | 94 | 95 | def main(raw_args=None): 96 | parser = argparse.ArgumentParser() 97 | parser.add_argument("--model_name", 98 | type=str, 99 | required=True, 100 | help="model name e.g. 
bert-base-uncased") 101 | parser.add_argument("--cache_dir", 102 | type=str, 103 | default=None, 104 | required=False, 105 | help="Directory containing pytorch model") 106 | parser.add_argument("--pytorch_model_path", 107 | type=str, 108 | required=True, 109 | help="/path/to/.bin") 110 | parser.add_argument("--tf_cache_dir", 111 | type=str, 112 | required=True, 113 | help="Directory in which to save tensorflow model") 114 | args = parser.parse_args(raw_args) 115 | 116 | model = BertModel.from_pretrained( 117 | pretrained_model_name_or_path=args.model_name, 118 | state_dict=torch.load(args.pytorch_model_path), 119 | cache_dir=args.cache_dir 120 | ) 121 | 122 | convert_pytorch_checkpoint_to_tf( 123 | model=model, 124 | ckpt_dir=args.tf_cache_dir, 125 | model_name=args.model_name 126 | ) 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/tests/tokenization_xlnet_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | 20 | from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE) 21 | 22 | from .tokenization_tests_commons import CommonTestCases 23 | 24 | SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), 25 | 'fixtures/test_sentencepiece.model') 26 | 27 | class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester): 28 | 29 | tokenizer_class = XLNetTokenizer 30 | 31 | def setUp(self): 32 | super(XLNetTokenizationTest, self).setUp() 33 | 34 | # We have a SentencePiece fixture for testing 35 | tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) 36 | tokenizer.save_pretrained(self.tmpdirname) 37 | 38 | def get_tokenizer(self, **kwargs): 39 | return XLNetTokenizer.from_pretrained(self.tmpdirname, **kwargs) 40 | 41 | def get_input_output_texts(self): 42 | input_text = u"This is a test" 43 | output_text = u"This is a test" 44 | return input_text, output_text 45 | 46 | 47 | def test_full_tokenizer(self): 48 | tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) 49 | 50 | tokens = tokenizer.tokenize(u'This is a test') 51 | self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est']) 52 | 53 | self.assertListEqual( 54 | tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382]) 55 | 56 | tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") 57 | self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', 58 | u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', 59 | u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', 60 | SPIECE_UNDERLINE + u'is', 
SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.']) 61 | ids = tokenizer.convert_tokens_to_ids(tokens) 62 | self.assertListEqual( 63 | ids, [8, 21, 84, 55, 24, 19, 7, 0, 64 | 602, 347, 347, 347, 3, 12, 66, 65 | 46, 72, 80, 6, 0, 4]) 66 | 67 | back_tokens = tokenizer.convert_ids_to_tokens(ids) 68 | self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', 69 | u'or', u'n', SPIECE_UNDERLINE + u'in', 70 | SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',', 71 | SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', 72 | SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', 73 | u'<unk>', u'.']) 74 | 75 | def test_tokenizer_lower(self): 76 | tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True) 77 | tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") 78 | self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'', u'i', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', 79 | u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', 80 | u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', 81 | SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.']) 82 | self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), [u"▁he", u"ll", u"o"]) 83 | 84 | def test_tokenizer_no_lower(self): 85 | tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=False) 86 | tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") 87 | self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', u'or', 88 | u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', 89 | u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', 90 | SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.']) 91 | 92 | def test_sequence_builders(self): 93 | tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased") 94 | 95 | text = tokenizer.encode("sequence builders") 96 | text_2 = tokenizer.encode("multi-sequence build") 97 | 98 | encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) 99 | encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) 100 | 101 | assert encoded_sentence == text + [4, 3] 102 | assert encoded_pair == text + [4] + text_2 + [4, 3] 103 | 104 | 105 | if __name__ == '__main__': 106 | unittest.main() 107 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/configuration_openai.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | """ OpenAI GPT configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 30 | "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json" 31 | } 32 | 33 | class OpenAIGPTConfig(PretrainedConfig): 34 | """ 35 | Configuration class to store the configuration of an `OpenAIGPTModel`. 36 | 37 | Args: 38 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file. 39 | n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...) 40 | n_positions: Number of positional embeddings. 41 | n_ctx: Size of the causal mask (usually same as n_positions). 42 | n_embd: Dimensionality of the embeddings and hidden states. 43 | n_layer: Number of hidden layers in the Transformer encoder. 44 | n_head: Number of attention heads for each attention layer in 45 | the Transformer encoder. 46 | afn: The non-linear activation function (function or string) in the 47 | encoder and pooler. If string, "gelu", "relu" and "swish" are supported. 48 | resid_pdrop: The dropout probability for all fully connected 49 | layers in the embeddings, encoder, and pooler. 50 | attn_pdrop: The dropout ratio for the attention 51 | probabilities. 52 | embd_pdrop: The dropout ratio for the embeddings. 53 | layer_norm_epsilon: epsilon to use in the layer norm layers 54 | initializer_range: The stddev of the truncated_normal_initializer for 55 | initializing all weight matrices. 56 | predict_special_tokens: whether to predict special tokens (when the model has an LM head) 57 | """ 58 | pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP 59 | 60 | def __init__( 61 | self, 62 | vocab_size_or_config_json_file=40478, 63 | n_positions=512, 64 | n_ctx=512, 65 | n_embd=768, 66 | n_layer=12, 67 | n_head=12, 68 | afn="gelu", 69 | resid_pdrop=0.1, 70 | embd_pdrop=0.1, 71 | attn_pdrop=0.1, 72 | layer_norm_epsilon=1e-5, 73 | initializer_range=0.02, 74 | predict_special_tokens=True, 75 | 76 | num_labels=1, 77 | summary_type='cls_index', 78 | summary_use_proj=True, 79 | summary_activation=None, 80 | summary_proj_to_labels=True, 81 | summary_first_dropout=0.1, 82 | **kwargs 83 | ): 84 | """Constructs OpenAIGPTConfig.
85 | """ 86 | super(OpenAIGPTConfig, self).__init__(**kwargs) 87 | 88 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 89 | and isinstance(vocab_size_or_config_json_file, unicode)): 90 | with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: 91 | json_config = json.loads(reader.read()) 92 | for key, value in json_config.items(): 93 | self.__dict__[key] = value 94 | elif isinstance(vocab_size_or_config_json_file, int): 95 | self.vocab_size = vocab_size_or_config_json_file 96 | self.n_ctx = n_ctx 97 | self.n_positions = n_positions 98 | self.n_embd = n_embd 99 | self.n_layer = n_layer 100 | self.n_head = n_head 101 | self.afn = afn 102 | self.resid_pdrop = resid_pdrop 103 | self.embd_pdrop = embd_pdrop 104 | self.attn_pdrop = attn_pdrop 105 | self.layer_norm_epsilon = layer_norm_epsilon 106 | self.initializer_range = initializer_range 107 | self.predict_special_tokens = predict_special_tokens 108 | 109 | self.num_labels = num_labels 110 | self.summary_type = summary_type 111 | self.summary_use_proj = summary_use_proj 112 | self.summary_activation = summary_activation 113 | self.summary_first_dropout = summary_first_dropout 114 | self.summary_proj_to_labels = summary_proj_to_labels 115 | else: 116 | raise ValueError( 117 | "First argument must be either a vocabulary size (int) " 118 | "or the path to a pretrained model config file (str)" 119 | ) 120 | 121 | @property 122 | def max_position_embeddings(self): 123 | return self.n_positions 124 | 125 | @property 126 | def hidden_size(self): 127 | return self.n_embd 128 | 129 | @property 130 | def num_attention_heads(self): 131 | return self.n_head 132 | 133 | @property 134 | def num_hidden_layers(self): 135 | return self.n_layer 136 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/tests/tokenization_bert_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | from io import open 20 | 21 | from pytorch_transformers.tokenization_bert import (BasicTokenizer, 22 | BertTokenizer, 23 | WordpieceTokenizer, 24 | _is_control, _is_punctuation, 25 | _is_whitespace, VOCAB_FILES_NAMES) 26 | 27 | from .tokenization_tests_commons import CommonTestCases 28 | 29 | class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): 30 | 31 | tokenizer_class = BertTokenizer 32 | 33 | def setUp(self): 34 | super(BertTokenizationTest, self).setUp() 35 | 36 | vocab_tokens = [ 37 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 38 | "##ing", ",", "low", "lowest", 39 | ] 40 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) 41 | with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer: 42 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) 43 | 44 | def get_tokenizer(self, **kwargs): 45 | return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) 46 | 47 | def get_input_output_texts(self): 48 | input_text = u"UNwant\u00E9d,running" 49 | output_text = u"unwanted, running" 50 | return input_text, output_text 51 | 52 | def test_full_tokenizer(self): 53 | tokenizer = self.tokenizer_class(self.vocab_file) 54 | 55 | tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") 56 | self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) 57 | self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) 58 | 59 | def test_chinese(self): 60 | tokenizer = BasicTokenizer() 61 | 62 | self.assertListEqual( 63 | tokenizer.tokenize(u"ah\u535A\u63A8zz"), 64 | [u"ah", u"\u535A", u"\u63A8", u"zz"]) 65 | 66 | def test_basic_tokenizer_lower(self): 67 | tokenizer = BasicTokenizer(do_lower_case=True) 68 | 69 | self.assertListEqual( 70 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), 71 | ["hello", "!", "how", "are", "you", "?"]) 72 | self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) 73 | 74 | def test_basic_tokenizer_no_lower(self): 75 | tokenizer = BasicTokenizer(do_lower_case=False) 76 | 77 | self.assertListEqual( 78 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? 
"), 79 | ["HeLLo", "!", "how", "Are", "yoU", "?"]) 80 | 81 | def test_wordpiece_tokenizer(self): 82 | vocab_tokens = [ 83 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 84 | "##ing" 85 | ] 86 | 87 | vocab = {} 88 | for (i, token) in enumerate(vocab_tokens): 89 | vocab[token] = i 90 | tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]") 91 | 92 | self.assertListEqual(tokenizer.tokenize(""), []) 93 | 94 | self.assertListEqual( 95 | tokenizer.tokenize("unwanted running"), 96 | ["un", "##want", "##ed", "runn", "##ing"]) 97 | 98 | self.assertListEqual( 99 | tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) 100 | 101 | def test_is_whitespace(self): 102 | self.assertTrue(_is_whitespace(u" ")) 103 | self.assertTrue(_is_whitespace(u"\t")) 104 | self.assertTrue(_is_whitespace(u"\r")) 105 | self.assertTrue(_is_whitespace(u"\n")) 106 | self.assertTrue(_is_whitespace(u"\u00A0")) 107 | 108 | self.assertFalse(_is_whitespace(u"A")) 109 | self.assertFalse(_is_whitespace(u"-")) 110 | 111 | def test_is_control(self): 112 | self.assertTrue(_is_control(u"\u0005")) 113 | 114 | self.assertFalse(_is_control(u"A")) 115 | self.assertFalse(_is_control(u" ")) 116 | self.assertFalse(_is_control(u"\t")) 117 | self.assertFalse(_is_control(u"\r")) 118 | 119 | def test_is_punctuation(self): 120 | self.assertTrue(_is_punctuation(u"-")) 121 | self.assertTrue(_is_punctuation(u"$")) 122 | self.assertTrue(_is_punctuation(u"`")) 123 | self.assertTrue(_is_punctuation(u".")) 124 | 125 | self.assertFalse(_is_punctuation(u"A")) 126 | self.assertFalse(_is_punctuation(u" ")) 127 | 128 | def test_sequence_builders(self): 129 | tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased") 130 | 131 | text = tokenizer.encode("sequence builders") 132 | text_2 = tokenizer.encode("multi-sequence build") 133 | 134 | encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) 135 | encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) 136 | 137 | assert encoded_sentence == [101] + text + [102] 138 | assert encoded_pair == [101] + text + [102] + text_2 + [102] 139 | 140 | if __name__ == '__main__': 141 | unittest.main() 142 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/tokenization_flaubert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for Flaubert, based on XLM.""" 16 | 17 | 18 | import logging 19 | import unicodedata 20 | 21 | import six 22 | 23 | from .tokenization_xlm import XLMTokenizer 24 | 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | VOCAB_FILES_NAMES = { 29 | "vocab_file": "vocab.json", 30 | "merges_file": "merges.txt", 31 | } 32 | 33 | PRETRAINED_VOCAB_FILES_MAP = { 34 | "vocab_file": { 35 | "flaubert-small-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/vocab.json", 36 | "flaubert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/vocab.json", 37 | "flaubert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/vocab.json", 38 | "flaubert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/vocab.json", 39 | }, 40 | "merges_file": { 41 | "flaubert-small-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/merges.txt", 42 | "flaubert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/merges.txt", 43 | "flaubert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/merges.txt", 44 | "flaubert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/merges.txt", 45 | }, 46 | } 47 | 48 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 49 | "flaubert-small-cased": 512, 50 | "flaubert-base-uncased": 512, 51 | "flaubert-base-cased": 512, 52 | "flaubert-large-cased": 512, 53 | } 54 | 55 | PRETRAINED_INIT_CONFIGURATION = { 56 | "flaubert-small-cased": {"do_lowercase": False}, 57 | "flaubert-base-uncased": {"do_lowercase": True}, 58 | "flaubert-base-cased": {"do_lowercase": False}, 59 | "flaubert-large-cased": {"do_lowercase": False}, 60 | } 61 | 62 | 63 | def convert_to_unicode(text): 64 | """ 65 | Converts `text` to Unicode (if it's not already), assuming UTF-8 input. 
66 | """ 67 | # six_ensure_text is copied from https://github.com/benjaminp/six 68 | def six_ensure_text(s, encoding="utf-8", errors="strict"): 69 | if isinstance(s, six.binary_type): 70 | return s.decode(encoding, errors) 71 | elif isinstance(s, six.text_type): 72 | return s 73 | else: 74 | raise TypeError("not expecting type '%s'" % type(s)) 75 | 76 | return six_ensure_text(text, encoding="utf-8", errors="ignore") 77 | 78 | 79 | class FlaubertTokenizer(XLMTokenizer): 80 | """ 81 | BPE tokenizer for Flaubert 82 | 83 | - Moses preprocessing & tokenization 84 | 85 | - Normalizes all input text 86 | 87 | - the argument ``special_tokens`` and the function ``set_special_tokens`` can be used to add additional symbols \ 88 | (ex: "__classify__") to a vocabulary 89 | 90 | - `do_lowercase` controls lowercasing (automatically set for pretrained vocabularies) 91 | """ 92 | 93 | vocab_files_names = VOCAB_FILES_NAMES 94 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 95 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 96 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 97 | 98 | def __init__(self, do_lowercase=False, **kwargs): 99 | super().__init__(**kwargs) 100 | self.do_lowercase = do_lowercase 101 | self.do_lowercase_and_remove_accent = False 102 | 103 | def preprocess_text(self, text): 104 | text = text.replace("``", '"').replace("''", '"') 105 | text = convert_to_unicode(text) 106 | text = unicodedata.normalize("NFC", text) 107 | 108 | if self.do_lowercase: 109 | text = text.lower() 110 | 111 | return text 112 | 113 | def _tokenize(self, text, bypass_tokenizer=False): 114 | """ 115 | Tokenize a string with Moses, given a language code. 116 | 117 | Details of tokenization: 118 | - [sacremoses](https://github.com/alvations/sacremoses): port of Moses 119 | - Install with `pip install sacremoses` 120 | 121 | Args: 122 | - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) (bool). If True, we only apply BPE. 123 | 124 | Returns: 125 | List of tokens. 126 | """ 127 | lang = "fr" 128 | if lang and self.lang2id and lang not in self.lang2id: 129 | logger.error( 130 | "Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model." 131 | ) 132 | 133 | if bypass_tokenizer: 134 | text = text.split() 135 | else: 136 | text = self.preprocess_text(text) 137 | text = self.moses_pipeline(text, lang=lang) 138 | text = self.moses_tokenize(text, lang=lang) 139 | 140 | split_tokens = [] 141 | for token in text: 142 | if token: 143 | split_tokens.extend([t for t in self.bpe(token).split(" ")]) 144 | 145 | return split_tokens 146 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert Transformer XL checkpoint and datasets.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import os 21 | import sys 22 | from io import open 23 | 24 | import torch 25 | 26 | import pytorch_transformers.tokenization_transfo_xl as data_utils 27 | 28 | from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME 29 | from pytorch_transformers import (TransfoXLConfig, TransfoXLLMHeadModel, 30 | load_tf_weights_in_transfo_xl) 31 | from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES) 32 | 33 | if sys.version_info[0] == 2: 34 | import cPickle as pickle 35 | else: 36 | import pickle 37 | 38 | import logging 39 | logging.basicConfig(level=logging.INFO) 40 | 41 | # We do this to be able to load python 2 datasets pickles 42 | # See e.g. https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918 43 | data_utils.Vocab = data_utils.TransfoXLTokenizer 44 | data_utils.Corpus = data_utils.TransfoXLCorpus 45 | sys.modules['data_utils'] = data_utils 46 | sys.modules['vocabulary'] = data_utils 47 | 48 | def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path, 49 | transfo_xl_config_file, 50 | pytorch_dump_folder_path, 51 | transfo_xl_dataset_file): 52 | if transfo_xl_dataset_file: 53 | # Convert a pre-processed corpus (see original TensorFlow repo) 54 | with open(transfo_xl_dataset_file, "rb") as fp: 55 | corpus = pickle.load(fp, encoding="latin1") 56 | # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term) 57 | pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['pretrained_vocab_file'] 58 | print("Save vocabulary to {}".format(pytorch_vocab_dump_path)) 59 | corpus_vocab_dict = corpus.vocab.__dict__ 60 | torch.save(corpus_vocab_dict, pytorch_vocab_dump_path) 61 | 62 | corpus_dict_no_vocab = corpus.__dict__ 63 | corpus_dict_no_vocab.pop('vocab', None) 64 | pytorch_dataset_dump_path = pytorch_dump_folder_path + '/' + CORPUS_NAME 65 | print("Save dataset to {}".format(pytorch_dataset_dump_path)) 66 | torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path) 67 | 68 | if tf_checkpoint_path: 69 | # Convert a pre-trained TensorFlow model 70 | config_path = os.path.abspath(transfo_xl_config_file) 71 | tf_path = os.path.abspath(tf_checkpoint_path) 72 | 73 | print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path)) 74 | # Initialise PyTorch model 75 | if transfo_xl_config_file == "": 76 | config = TransfoXLConfig() 77 | else: 78 | config = TransfoXLConfig.from_json_file(transfo_xl_config_file) 79 | print("Building PyTorch model from configuration: {}".format(str(config))) 80 | model = TransfoXLLMHeadModel(config) 81 | 82 | model = load_tf_weights_in_transfo_xl(model, config, tf_path) 83 | # Save pytorch-model 84 | pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) 85 | pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) 86 | print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) 87 | torch.save(model.state_dict(), pytorch_weights_dump_path) 88 | print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) 89 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 90 | f.write(config.to_json_string()) 
91 | 92 | 93 | if __name__ == "__main__": 94 | parser = argparse.ArgumentParser() 95 | parser.add_argument("--pytorch_dump_folder_path", 96 | default = None, 97 | type = str, 98 | required = True, 99 | help = "Path to the folder to store the PyTorch model or dataset/vocab.") 100 | parser.add_argument("--tf_checkpoint_path", 101 | default = "", 102 | type = str, 103 | help = "An optional path to a TensorFlow checkpoint to be converted.") 104 | parser.add_argument("--transfo_xl_config_file", 105 | default = "", 106 | type = str, 107 | help = "An optional config json file corresponding to the pre-trained Transformer-XL model. \n" 108 | "This specifies the model architecture.") 109 | parser.add_argument("--transfo_xl_dataset_file", 110 | default = "", 111 | type = str, 112 | help = "An optional dataset file to be converted into a vocabulary.") 113 | args = parser.parse_args() 114 | convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path, 115 | args.transfo_xl_config_file, 116 | args.pytorch_dump_folder_path, 117 | args.transfo_xl_dataset_file) 118 | -------------------------------------------------------------------------------- /language_modeling/examples/lm_finetuning/README.md: -------------------------------------------------------------------------------- 1 | # BERT Model Finetuning using Masked Language Modeling objective 2 | 3 | ## Introduction 4 | 5 | The three example scripts in this folder can be used to **fine-tune** a pre-trained BERT model using the pretraining objective (combination of masked language modeling and next sentence prediction loss). In general, pretrained models like BERT are first trained with a pretraining objective (masked language modeling and next sentence prediction for BERT) on a large and general natural language corpus. A classifier head is then added on top of the pre-trained architecture and the model is quickly fine-tuned on a target task, while still (hopefully) retaining its general language understanding. This greatly reduces overfitting and yields state-of-the-art results, especially when training data for the target task are limited. 6 | 7 | The [ULMFiT paper](https://arxiv.org/abs/1801.06146) took a slightly different approach, however, and added an intermediate step in which the model is fine-tuned on text **from the same domain as the target task and using the pretraining objective** before the final stage in which the classifier head is added and the model is trained on the target task itself. This paper reported significantly improved results from this step, and its authors found that they could get high-quality classifications even with only tiny numbers (<1000) of labelled training examples, as long as they had a lot of unlabelled data from the target domain. 8 | 9 | Although this wasn't covered in the original BERT paper, domain-specific fine-tuning of Transformer models has [recently been reported by other authors](https://arxiv.org/pdf/1905.05583.pdf), and they report performance improvements as well. 10 | 11 | ## Input format 12 | 13 | The scripts in this folder expect a single file as input, consisting of untokenized text, with one **sentence** per line, and one blank line between documents. The reason for the sentence splitting is that part of BERT's training involves a _next sentence_ objective in which the model must predict whether two sequences of text are contiguous text from the same document or not, and to avoid making the task _too easy_, the split point between the sequences is always at the end of a sentence.
The linebreaks in the file are therefore necessary to mark the points where the text can be split. 14 | 15 | ## Usage 16 | 17 | There are two ways to fine-tune a language model using these scripts. The first _quick_ approach is to use [`simple_lm_finetuning.py`](./simple_lm_finetuning.py). It does everything in a single script, but generates training instances that consist of just two sentences. This is quite different from the BERT paper, where (confusingly) the NextSentence task concatenated sentences together from each document to form two long multi-sentences, which the paper just referred to as _sentences_. The difference between this simple approach and the original paper approach can have a significant effect for long sequences since two sentences will be much shorter than the max sequence length. In this case, most of each training example will just consist of blank padding characters, which wastes a lot of computation and results in a model that isn't really training on long sequences. 18 | 19 | As such, the preferred approach (assuming you have documents containing multiple contiguous sentences from your target domain) is to use [`pregenerate_training_data.py`](./pregenerate_training_data.py) to pre-process your data into training examples following the methodology used for LM training in the original BERT paper and repository. Since there is a significant random component to training data generation for BERT, this script includes an option to generate multiple _epochs_ of pre-processed data, to avoid training on the same random splits each epoch. Generating an epoch of data for each training epoch should result in a better final model, and so we recommend doing so. 20 | 21 | You can then train on the pregenerated data using [`finetune_on_pregenerated.py`](./finetune_on_pregenerated.py), pointing it to the folder created by [`pregenerate_training_data.py`](./pregenerate_training_data.py). Note that you should use the same `bert_model` and case options for both! Also note that `max_seq_len` does not need to be specified for the [`finetune_on_pregenerated.py`](./finetune_on_pregenerated.py) script, as it is inferred from the training examples. 22 | 23 | There are various options that can be tweaked, but they are mostly set to the values from the BERT paper/repository and default values should make sense. The most relevant ones are: 24 | 25 | - `--max_seq_len`: Controls the length of training examples (in wordpiece tokens) seen by the model. Defaults to 128 but can be set as high as 512. Higher values may yield stronger language models at the cost of slower and more memory-intensive training. 26 | - `--fp16`: Enables fast half-precision training on recent GPUs. 27 | 28 | In addition, if memory usage is an issue, especially when training on a single GPU, reducing `--train_batch_size` from the default 32 to a lower number (4-16) can be helpful, or leaving `--train_batch_size` at the default and increasing `--gradient_accumulation_steps` to 2-8. Changing `--gradient_accumulation_steps` may be preferable as alterations to the batch size may require corresponding changes in the learning rate to compensate. There is also a `--reduce_memory` option for both the `pregenerate_training_data.py` and `finetune_on_pregenerated.py` scripts that spills data to disc in shelf objects or numpy memmaps rather than retaining it in memory, which significantly reduces memory usage with little performance impact.
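For example, a memory-constrained single-GPU run might combine these options as follows. This is only a sketch: the flag values (batch size 8 with 4 accumulation steps) are illustrative assumptions, not recommendations from the scripts themselves, and should be tuned for your hardware.

```
python3 finetune_on_pregenerated.py \
--pregenerated_data training/ \
--bert_model bert-base-uncased \
--do_lower_case \
--output_dir finetuned_lm/ \
--epochs 3 \
--train_batch_size 8 \
--gradient_accumulation_steps 4 \
--reduce_memory
```

Because 8 × 4 keeps the effective batch size at the default 32, the learning rate should not need adjusting.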
29 | 30 | ## Examples 31 | 32 | ### Simple fine-tuning 33 | 34 | ``` 35 | python3 simple_lm_finetuning.py \ 36 | --train_corpus my_corpus.txt \ 37 | --bert_model bert-base-uncased \ 38 | --do_lower_case \ 39 | --output_dir finetuned_lm/ \ 40 | --do_train 41 | ``` 42 | 43 | ### Pregenerating training data 44 | 45 | ``` 46 | python3 pregenerate_training_data.py \ 47 | --train_corpus my_corpus.txt \ 48 | --bert_model bert-base-uncased \ 49 | --do_lower_case \ 50 | --output_dir training/ \ 51 | --epochs_to_generate 3 \ 52 | --max_seq_len 256 53 | ``` 54 | 55 | ### Training on pregenerated data 56 | 57 | ``` 58 | python3 finetune_on_pregenerated.py \ 59 | --pregenerated_data training/ \ 60 | --bert_model bert-base-uncased \ 61 | --do_lower_case \ 62 | --output_dir finetuned_lm/ \ 63 | --epochs 3 64 | ``` 65 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/configuration_gpt2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ OpenAI GPT-2 configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", 30 | "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json", 31 | "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json"} 32 | 33 | class GPT2Config(PretrainedConfig): 34 | """Configuration class to store the configuration of a `GPT2Model`. 35 | 36 | Args: 37 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. 38 | n_positions: Number of positional embeddings. 39 | n_ctx: Size of the causal mask (usually same as n_positions). 40 | n_embd: Dimensionality of the embeddings and hidden states. 41 | n_layer: Number of hidden layers in the Transformer encoder. 42 | n_head: Number of attention heads for each attention layer in 43 | the Transformer encoder. 44 | layer_norm_epsilon: epsilon to use in the layer norm layers 45 | resid_pdrop: The dropout probability for all fully connected 46 | layers in the embeddings, encoder, and pooler. 47 | attn_pdrop: The dropout ratio for the attention 48 | probabilities. 49 | embd_pdrop: The dropout ratio for the embeddings. 50 | initializer_range: The stddev of the truncated_normal_initializer for 51 | initializing all weight matrices. 52 | """
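# Usage sketch (editorial illustration, not part of the original file; the JSON path is a placeholder):
#   config = GPT2Config()                                                   # defaults below (n_embd=1024, n_layer=24, n_head=16)
#   config = GPT2Config(vocab_size_or_config_json_file=50257, n_layer=12)   # override individual fields
#   config = GPT2Config('path/to/gpt2-config.json')                         # or load every field from a JSON config file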
53 | pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP 54 | 55 | def __init__( 56 | self, 57 | vocab_size_or_config_json_file=50257, 58 | n_positions=1024, 59 | n_ctx=1024, 60 | n_embd=1024, 61 | n_layer=24, 62 | n_head=16, 63 | resid_pdrop=0.1, 64 | embd_pdrop=0.1, 65 | attn_pdrop=0.1, 66 | layer_norm_epsilon=1e-5, 67 | initializer_range=0.02, 68 | 69 | num_labels=1, 70 | summary_type='cls_index', 71 | summary_use_proj=True, 72 | summary_activation=None, 73 | summary_proj_to_labels=True, 74 | summary_first_dropout=0.1, 75 | **kwargs 76 | ): 77 | """Constructs GPT2Config. 78 | 79 | Args: 80 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. 81 | n_positions: Number of positional embeddings. 82 | n_ctx: Size of the causal mask (usually same as n_positions). 83 | n_embd: Dimensionality of the embeddings and hidden states. 84 | n_layer: Number of hidden layers in the Transformer encoder. 85 | n_head: Number of attention heads for each attention layer in 86 | the Transformer encoder. 87 | layer_norm_epsilon: epsilon to use in the layer norm layers 88 | resid_pdrop: The dropout probability for all fully connected 89 | layers in the embeddings, encoder, and pooler. 90 | attn_pdrop: The dropout ratio for the attention 91 | probabilities. 92 | embd_pdrop: The dropout ratio for the embeddings. 93 | initializer_range: The stddev of the truncated_normal_initializer for 94 | initializing all weight matrices. 95 | """ 96 | super(GPT2Config, self).__init__(**kwargs) 97 | 98 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 99 | and isinstance(vocab_size_or_config_json_file, unicode)): 100 | with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: 101 | json_config = json.loads(reader.read()) 102 | for key, value in json_config.items(): 103 | self.__dict__[key] = value 104 | elif isinstance(vocab_size_or_config_json_file, int): 105 | self.vocab_size = vocab_size_or_config_json_file 106 | self.n_ctx = n_ctx 107 | self.n_positions = n_positions 108 | self.n_embd = n_embd 109 | self.n_layer = n_layer 110 | self.n_head = n_head 111 | self.resid_pdrop = resid_pdrop 112 | self.embd_pdrop = embd_pdrop 113 | self.attn_pdrop = attn_pdrop 114 | self.layer_norm_epsilon = layer_norm_epsilon 115 | self.initializer_range = initializer_range 116 | 117 | self.num_labels = num_labels 118 | self.summary_type = summary_type 119 | self.summary_use_proj = summary_use_proj 120 | self.summary_activation = summary_activation 121 | self.summary_first_dropout = summary_first_dropout 122 | self.summary_proj_to_labels = summary_proj_to_labels 123 | else: 124 | raise ValueError( 125 | "First argument must be either a vocabulary size (int)" 126 | " or the path to a pretrained model config file (str)" 127 | ) 128 | 129 | @property 130 | def max_position_embeddings(self): 131 | return self.n_positions 132 | 133 | @property 134 | def hidden_size(self): 135 | return self.n_embd 136 | 137 | @property 138 | def num_attention_heads(self): 139 | return self.n_head 140 | 141 | @property 142 | def num_hidden_layers(self): 143 | return self.n_layer 144 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/tests/optimization_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors.
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import os 21 | 22 | import torch 23 | 24 | from pytorch_transformers import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, 25 | WarmupCosineSchedule, WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule) 26 | 27 | from .tokenization_tests_commons import TemporaryDirectory 28 | 29 | 30 | def unwrap_schedule(scheduler, num_steps=10): 31 | lrs = [] 32 | for _ in range(num_steps): 33 | scheduler.step() 34 | lrs.append(scheduler.get_lr()) 35 | return lrs 36 | 37 | def unwrap_and_save_reload_schedule(scheduler, num_steps=10): 38 | lrs = [] 39 | for step in range(num_steps): 40 | scheduler.step() 41 | lrs.append(scheduler.get_lr()) 42 | if step == num_steps // 2: 43 | with TemporaryDirectory() as tmpdirname: 44 | file_name = os.path.join(tmpdirname, 'schedule.bin') 45 | torch.save(scheduler.state_dict(), file_name) 46 | 47 | state_dict = torch.load(file_name) 48 | scheduler.load_state_dict(state_dict) 49 | return lrs 50 | 51 | class OptimizationTest(unittest.TestCase): 52 | 53 | def assertListAlmostEqual(self, list1, list2, tol): 54 | self.assertEqual(len(list1), len(list2)) 55 | for a, b in zip(list1, list2): 56 | self.assertAlmostEqual(a, b, delta=tol) 57 | 58 | def test_adam_w(self): 59 | w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True) 60 | target = torch.tensor([0.4, 0.2, -0.5]) 61 | criterion = torch.nn.MSELoss() 62 | # No warmup, constant schedule, no gradient clipping 63 | optimizer = AdamW(params=[w], lr=2e-1, weight_decay=0.0) 64 | for _ in range(100): 65 | loss = criterion(w, target) 66 | loss.backward() 67 | optimizer.step() 68 | w.grad.detach_() # No zero_grad() function on simple tensors. We do it ourselves. 69 | w.grad.zero_() 70 | self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2) 71 | 72 | 73 | class ScheduleInitTest(unittest.TestCase): 74 | m = torch.nn.Linear(50, 50) 75 | optimizer = AdamW(m.parameters(), lr=10.) 76 | num_steps = 10 77 | 78 | def assertListAlmostEqual(self, list1, list2, tol): 79 | self.assertEqual(len(list1), len(list2)) 80 | for a, b in zip(list1, list2): 81 | self.assertAlmostEqual(a, b, delta=tol) 82 | 83 | def test_constant_scheduler(self): 84 | scheduler = ConstantLRSchedule(self.optimizer) 85 | lrs = unwrap_schedule(scheduler, self.num_steps) 86 | expected_learning_rates = [10.]
* self.num_steps 87 | self.assertEqual(len(lrs[0]), 1) 88 | self.assertListEqual([l[0] for l in lrs], expected_learning_rates) 89 | 90 | scheduler = ConstantLRSchedule(self.optimizer) 91 | lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) 92 | self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) 93 | 94 | def test_warmup_constant_scheduler(self): 95 | scheduler = WarmupConstantSchedule(self.optimizer, warmup_steps=4) 96 | lrs = unwrap_schedule(scheduler, self.num_steps) 97 | expected_learning_rates = [2.5, 5.0, 7.5, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0] 98 | self.assertEqual(len(lrs[0]), 1) 99 | self.assertListEqual([l[0] for l in lrs], expected_learning_rates) 100 | 101 | scheduler = WarmupConstantSchedule(self.optimizer, warmup_steps=4) 102 | lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) 103 | self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) 104 | 105 | def test_warmup_linear_scheduler(self): 106 | scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10) 107 | lrs = unwrap_schedule(scheduler, self.num_steps) 108 | expected_learning_rates = [5.0, 10.0, 8.75, 7.5, 6.25, 5.0, 3.75, 2.5, 1.25, 0.0] 109 | self.assertEqual(len(lrs[0]), 1) 110 | self.assertListEqual([l[0] for l in lrs], expected_learning_rates) 111 | 112 | scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10) 113 | lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) 114 | self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) 115 | 116 | def test_warmup_cosine_scheduler(self): 117 | scheduler = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10) 118 | lrs = unwrap_schedule(scheduler, self.num_steps) 119 | expected_learning_rates = [5.0, 10.0, 9.61, 8.53, 6.91, 5.0, 3.08, 1.46, 0.38, 0.0] 120 | self.assertEqual(len(lrs[0]), 1) 121 | self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2) 122 | 123 | scheduler = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10) 124 | lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) 125 | self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) 126 | 127 | def test_warmup_cosine_hard_restart_scheduler(self): 128 | scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, warmup_steps=2, cycles=2, t_total=10) 129 | lrs = unwrap_schedule(scheduler, self.num_steps) 130 | expected_learning_rates = [5.0, 10.0, 8.53, 5.0, 1.46, 10.0, 8.53, 5.0, 1.46, 0.0] 131 | self.assertEqual(len(lrs[0]), 1) 132 | self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2) 133 | 134 | scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, warmup_steps=2, cycles=2, t_total=10) 135 | lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) 136 | self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) 137 | 138 | if __name__ == "__main__": 139 | unittest.main() 140 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/configuration_bert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ BERT model configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 30 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", 31 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", 32 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", 33 | 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", 34 | 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", 35 | 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", 36 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", 37 | 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", 38 | 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", 39 | 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", 40 | 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", 41 | 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", 42 | 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", 43 | } 44 | 45 | 46 | class BertConfig(PretrainedConfig): 47 | r""" 48 | :class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a 49 | `BertModel`. 50 | 51 | 52 | Arguments: 53 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. 54 | hidden_size: Size of the encoder layers and the pooler layer. 55 | num_hidden_layers: Number of hidden layers in the Transformer encoder. 56 | num_attention_heads: Number of attention heads for each attention layer in 57 | the Transformer encoder. 58 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 59 | layer in the Transformer encoder. 60 | hidden_act: The non-linear activation function (function or string) in the 61 | encoder and pooler. If string, "gelu", "relu" and "swish" are supported. 62 | hidden_dropout_prob: The dropout probability for all fully connected 63 | layers in the embeddings, encoder, and pooler.
64 | attention_probs_dropout_prob: The dropout ratio for the attention 65 | probabilities. 66 | max_position_embeddings: The maximum sequence length that this model might 67 | ever be used with. Typically set this to something large just in case 68 | (e.g., 512 or 1024 or 2048). 69 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 70 | `BertModel`. 71 | initializer_range: The stddev of the truncated_normal_initializer for 72 | initializing all weight matrices. 73 | layer_norm_eps: The epsilon used by LayerNorm. 74 | """ 75 | pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP 76 | 77 | def __init__(self, 78 | vocab_size_or_config_json_file=30522, 79 | hidden_size=768, 80 | num_hidden_layers=12, 81 | num_attention_heads=12, 82 | intermediate_size=3072, 83 | hidden_act="gelu", 84 | hidden_dropout_prob=0.1, 85 | attention_probs_dropout_prob=0.1, 86 | max_position_embeddings=512, 87 | type_vocab_size=2, 88 | initializer_range=0.02, 89 | layer_norm_eps=1e-12, 90 | **kwargs): 91 | super(BertConfig, self).__init__(**kwargs) 92 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 93 | and isinstance(vocab_size_or_config_json_file, unicode)): 94 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 95 | json_config = json.loads(reader.read()) 96 | for key, value in json_config.items(): 97 | self.__dict__[key] = value 98 | elif isinstance(vocab_size_or_config_json_file, int): 99 | self.vocab_size = vocab_size_or_config_json_file 100 | self.hidden_size = hidden_size 101 | self.num_hidden_layers = num_hidden_layers 102 | self.num_attention_heads = num_attention_heads 103 | self.hidden_act = hidden_act 104 | self.intermediate_size = intermediate_size 105 | self.hidden_dropout_prob = hidden_dropout_prob 106 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 107 | self.max_position_embeddings = max_position_embeddings 108 | self.type_vocab_size = type_vocab_size 109 | self.initializer_range = initializer_range 110 | self.layer_norm_eps = layer_norm_eps 111 | else: 112 | raise ValueError("First argument must be either a vocabulary size (int)" 113 | " or the path to a pretrained model config file (str)") 114 | -------------------------------------------------------------------------------- /language_modeling/examples/distillation/README.md: -------------------------------------------------------------------------------- 1 | # DistilBERT 2 | 3 | This folder contains the original code used to train DistilBERT as well as examples showcasing how to use DistilBERT. 4 | 5 | ## What is DistilBERT 6 | 7 | DistilBERT stands for Distilled-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on the BERT architecture. It has 40% fewer parameters than `bert-base-uncased` and runs 60% faster, while preserving over 95% of BERT's performance as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model, called the teacher, into a smaller model, called the student. By distilling BERT, we obtain a smaller Transformer model that bears a lot of similarities to the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option for putting large-scale pretrained Transformer models into production.
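To make the distillation objective concrete, here is a minimal sketch of the classic soft-target loss (after Hinton et al., 2015). It is illustrative only (the actual training loss in `train.py` combines several terms), and the function name and temperature value are our own choices:

```python
import torch.nn.functional as F

def soft_target_loss(student_logits, teacher_logits, temperature=2.0):
    # Soften both output distributions with a temperature, then push the
    # student towards the teacher's distribution via KL divergence.
    soft_teacher = F.softmax(teacher_logits / temperature, dim=-1)
    log_soft_student = F.log_softmax(student_logits / temperature, dim=-1)
    # The T**2 factor keeps gradient magnitudes comparable across temperatures.
    return F.kl_div(log_soft_student, soft_teacher, reduction='batchmean') * temperature ** 2
```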
8 | 9 | For more information on DistilBERT, please refer to our [detailed blog post](https://medium.com/huggingface/smaller-faster-cheaper-lighter-introducing-distilbert-a-distilled-version-of-bert-8cf3380435b5). 10 | 11 | ## Setup 12 | 13 | This part of the library has only been tested with Python 3.6+. There are a few specific dependencies to install before launching a distillation; you can install them with the command `pip install -r requirements.txt`. 14 | 15 | 16 | **Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breaking changes compared to v1.1.0). Note that there is a small internal bug in the current version of PyTorch available on pip that causes a memory leak in our training/distillation. It has recently been fixed and will likely be integrated into the next release. For the moment, we recommend [compiling PyTorch from source](https://github.com/pytorch/pytorch#from-source). Please refer to [issue 1179](https://github.com/huggingface/pytorch-transformers/issues/1179) for more details. 17 | 18 | ## How to use DistilBERT 19 | 20 | PyTorch-Transformers includes two pre-trained DistilBERT models, currently only provided for English (we are investigating the possibility of training and releasing a multilingual version of DistilBERT): 21 | 22 | - `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain BERT (a concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of BERT. The model has 6 layers, a hidden size of 768 and 12 heads, totaling 66M parameters. 23 | - `distilbert-base-uncased-distilled-squad`: A version of `distilbert-base-uncased` fine-tuned using (a second step of) knowledge distillation on SQuAD 1.0. This model reaches an F1 score of 86.2 on the dev set (for comparison, the `bert-base-uncased` version of BERT reaches an 88.5 F1 score). 24 | 25 | Using DistilBERT is very similar to using BERT. DistilBERT shares the same tokenizer as BERT's `bert-base-uncased`, even though we provide a link to this tokenizer under the `DistilBertTokenizer` name for consistent naming across the library's models. For example (after `import torch` and `from pytorch_transformers import DistilBertModel, DistilBertTokenizer`): 26 | 27 | ```python 28 | tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') 29 | model = DistilBertModel.from_pretrained('distilbert-base-uncased') 30 | 31 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) 32 | outputs = model(input_ids) 33 | last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple 34 | ``` 35 | 36 | ## How to train DistilBERT 37 | 38 | In the following, we will explain how you can train your own compressed model. 39 | 40 | ### A. Preparing the data 41 | 42 | The weights we release are trained using a concatenation of the Toronto Book Corpus and English Wikipedia (the same training data as the English version of BERT). 43 | 44 | To avoid processing the data several times, we do it once and for all before training. From now on, we will suppose that you have a text file `dump.txt` which contains one sequence per line (a sequence being composed of one or several coherent sentences). 45 | 46 | First, we will binarize the data, i.e. tokenize the data and convert each token into an index in our model's vocabulary.
47 | 48 | ```bash 49 | python scripts/binarized_data.py \ 50 | --file_path data/dump.txt \ 51 | --bert_tokenizer bert-base-uncased \ 52 | --dump_file data/binarized_text 53 | ``` 54 | 55 | Our implementation of the masked language modeling loss follows [XLM](https://github.com/facebookresearch/XLM)'s and smooths the masking probability with a factor that puts more emphasis on rare words. We therefore count the occurrences of each token in the data: 56 | 57 | ```bash 58 | python scripts/token_counts.py \ 59 | --data_file data/binarized_text.bert-base-uncased.pickle \ 60 | --token_counts_dump data/token_counts.bert-base-uncased.pickle 61 | ``` 62 | 63 | ### B. Training 64 | 65 | Training with distillation is really simple once you have pre-processed the data: 66 | 67 | ```bash 68 | python train.py \ 69 | --dump_path serialization_dir/my_first_training \ 70 | --data_file data/binarized_text.bert-base-uncased.pickle \ 71 | --token_counts data/token_counts.bert-base-uncased.pickle \ 72 | --force # overwrites the `dump_path` if it already exists. 73 | ``` 74 | 75 | By default, this will launch training on a single GPU (even if more are available on the cluster). Other parameters are available on the command line; please look in `train.py` or run `python train.py --help` to list them. 76 | 77 | We highly encourage you to use distributed training for training DistilBERT, as the training corpus is quite large. Here's an example that runs distributed training on a single node with 4 GPUs: 78 | 79 | ```bash 80 | export NODE_RANK=0 81 | export N_NODES=1 82 | 83 | export N_GPU_NODE=4 84 | export WORLD_SIZE=4 85 | export MASTER_PORT= 86 | export MASTER_ADDR= 87 | 88 | pkill -f 'python -u train.py' 89 | 90 | python -m torch.distributed.launch \ 91 | --nproc_per_node=$N_GPU_NODE \ 92 | --nnodes=$N_NODES \ 93 | --node_rank $NODE_RANK \ 94 | --master_addr $MASTER_ADDR \ 95 | --master_port $MASTER_PORT \ 96 | train.py \ 97 | --force \ 98 | --n_gpu $WORLD_SIZE \ 99 | --data_file data/binarized_text.bert-base-uncased.pickle \ 100 | --token_counts data/token_counts.bert-base-uncased.pickle \ 101 | --dump_path serialization_dir/my_first_distillation 102 | ``` 103 | 104 | **Tip:** Starting the distillation with a good initialization of the model weights is crucial to reach decent performance. In our experiments, we initialized our model from a few layers of the teacher (BERT) itself! Please refer to `scripts/extract_for_distil.py` to create a valid initialization checkpoint, and use the `--from_pretrained_weights` and `--from_pretrained_config` arguments to use this initialization for the distillation training! 105 | 106 | Happy distillation! 107 | -------------------------------------------------------------------------------- /language_modeling/hubconf.py: -------------------------------------------------------------------------------- 1 | from pytorch_transformers import ( 2 | AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering 3 | ) 4 | from pytorch_transformers.file_utils import add_start_docstrings 5 | 6 | dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses'] 7 | 8 | @add_start_docstrings(AutoConfig.__doc__) 9 | def config(*args, **kwargs): 10 | r""" 11 | # Using torch.hub ! 12 | import torch 13 | 14 | config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased') # Download configuration from S3 and cache.
15 | config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` 16 | config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/my_configuration.json') 17 | config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False) 18 | assert config.output_attention == True 19 | config, unused_kwargs = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True) 20 | assert config.output_attention == True 21 | assert unused_kwargs == {'foo': False} 22 | 23 | """ 24 | 25 | return AutoConfig.from_pretrained(*args, **kwargs) 26 | 27 | 28 | @add_start_docstrings(AutoTokenizer.__doc__) 29 | def tokenizer(*args, **kwargs): 30 | r""" 31 | # Using torch.hub ! 32 | import torch 33 | 34 | tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased') # Download vocabulary from S3 and cache. 35 | tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')` 36 | 37 | """ 38 | 39 | return AutoTokenizer.from_pretrained(*args, **kwargs) 40 | 41 | 42 | @add_start_docstrings(AutoModel.__doc__) 43 | def model(*args, **kwargs): 44 | r""" 45 | # Using torch.hub ! 46 | import torch 47 | 48 | model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased') # Download model and configuration from S3 and cache. 49 | model = torch.hub.load('huggingface/pytorch-transformers', 'model', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` 50 | model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', output_attention=True) # Update configuration during loading 51 | assert model.config.output_attention == True 52 | # Loading from a TF checkpoint file instead of a PyTorch model (slower) 53 | config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') 54 | model = torch.hub.load('huggingface/pytorch-transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) 55 | 56 | """ 57 | 58 | return AutoModel.from_pretrained(*args, **kwargs) 59 | 60 | @add_start_docstrings(AutoModelWithLMHead.__doc__) 61 | def modelWithLMHead(*args, **kwargs): 62 | r""" 63 | # Using torch.hub ! 64 | import torch 65 | 66 | model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased') # Download model and configuration from S3 and cache. 67 | model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './test/bert_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` 68 | model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True) # Update configuration during loading 69 | assert model.config.output_attention == True 70 | # Loading from a TF checkpoint file instead of a PyTorch model (slower) 71 | config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') 72 | model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) 73 | 74 | """ 75 | return AutoModelWithLMHead.from_pretrained(*args, **kwargs) 76 | 77 | 78 | @add_start_docstrings(AutoModelForSequenceClassification.__doc__) 79 | def modelForSequenceClassification(*args, **kwargs): 80 | r""" 81 | # Using torch.hub ! 82 | import torch 83 | 84 | model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased') # Download model and configuration from S3 and cache. 85 | model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` 86 | model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True) # Update configuration during loading 87 | assert model.config.output_attention == True 88 | # Loading from a TF checkpoint file instead of a PyTorch model (slower) 89 | config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') 90 | model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) 91 | 92 | """ 93 | 94 | return AutoModelForSequenceClassification.from_pretrained(*args, **kwargs) 95 | 96 | 97 | @add_start_docstrings(AutoModelForQuestionAnswering.__doc__) 98 | def modelForQuestionAnswering(*args, **kwargs): 99 | r""" 100 | # Using torch.hub ! 101 | import torch 102 | 103 | model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased') # Download model and configuration from S3 and cache. 104 | model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` 105 | model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True) # Update configuration during loading 106 | assert model.config.output_attention == True 107 | # Loading from a TF checkpoint file instead of a PyTorch model (slower) 108 | config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') 109 | model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) 110 | 111 | """ 112 | return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs) 113 | -------------------------------------------------------------------------------- /language_modeling/examples/single_model_scripts/run_transfo_xl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ PyTorch Transformer XL model evaluation script. 17 | Adapted from https://github.com/kimiyoung/transformer-xl. 18 | In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py 19 | 20 | This script with default values evaluates a pretrained Transformer-XL on WikiText 103 21 | """ 22 | from __future__ import absolute_import, division, print_function, unicode_literals 23 | 24 | import argparse 25 | import logging 26 | import time 27 | import math 28 | 29 | import torch 30 | 31 | from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer 32 | 33 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 34 | datefmt = '%m/%d/%Y %H:%M:%S', 35 | level = logging.INFO) 36 | logger = logging.getLogger(__name__) 37 | 38 | def main(): 39 | parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model') 40 | parser.add_argument('--model_name', type=str, default='transfo-xl-wt103', 41 | help='pretrained model name') 42 | parser.add_argument('--split', type=str, default='test', 43 | choices=['all', 'valid', 'test'], 44 | help='which split to evaluate') 45 | parser.add_argument('--batch_size', type=int, default=10, 46 | help='batch size') 47 | parser.add_argument('--tgt_len', type=int, default=128, 48 | help='number of tokens to predict') 49 | parser.add_argument('--ext_len', type=int, default=0, 50 | help='length of the extended context') 51 | parser.add_argument('--mem_len', type=int, default=1600, 52 | help='length of the retained previous hidden states (memory)') 53 | parser.add_argument('--clamp_len', type=int, default=1000, 54 | help='max positional embedding index') 55 | parser.add_argument('--no_cuda', action='store_true', 56 | help='Do not use CUDA even though CUDA is available') 57 | parser.add_argument('--work_dir', type=str, required=True, 58 | help='path to the work_dir') 59 | parser.add_argument('--no_log', action='store_true', 60 | help='do not log the eval result') 61 | parser.add_argument('--same_length', action='store_true', 62 | help='set same length attention with masking') 63 | parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") 64 | parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") 65 | args = parser.parse_args() 66 | assert args.ext_len >= 0, 'extended context length must be non-negative' 67 | 68 | if args.server_ip and args.server_port: 69 | # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script 70 | import ptvsd 71 | print("Waiting for debugger attach") 72 | ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) 73 | ptvsd.wait_for_attach() 74 | 75 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 76 | logger.info("device: {}".format(device)) 77 | 78 | # Load a pre-processed dataset
79 | # You can also build the corpus yourself using TransfoXLCorpus methods 80 | # The pre-processing involves computing word frequencies to prepare the adaptive input and softmax 81 | # and tokenizing the dataset 82 | # The pre-processed corpus is a conversion of the original dataset (created with the conversion script) 83 | tokenizer = TransfoXLTokenizer.from_pretrained(args.model_name) 84 | corpus = TransfoXLCorpus.from_pretrained(args.model_name) 85 | ntokens = len(corpus.vocab) 86 | 87 | va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len, 88 | device=device, ext_len=args.ext_len) 89 | te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len, 90 | device=device, ext_len=args.ext_len) 91 | 92 | # Load a pre-trained model 93 | model = TransfoXLLMHeadModel.from_pretrained(args.model_name) 94 | model = model.to(device) 95 | 96 | logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format( 97 | args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len)) 98 | 99 | model.reset_length(args.tgt_len, args.ext_len, args.mem_len) 100 | if args.clamp_len > 0: 101 | model.clamp_len = args.clamp_len 102 | if args.same_length: 103 | model.same_length = True 104 | 105 | ############################################################################### 106 | # Evaluation code 107 | ############################################################################### 108 | def evaluate(eval_iter): 109 | # Turn on evaluation mode which disables dropout. 110 | model.eval() 111 | total_len, total_loss = 0, 0. 112 | start_time = time.time() 113 | with torch.no_grad(): 114 | mems = None 115 | for idx, (data, target, seq_len) in enumerate(eval_iter): 116 | ret = model(data, target, mems) 117 | loss, _, mems = ret 118 | loss = loss.mean() 119 | total_loss += seq_len * loss.item() 120 | total_len += seq_len 121 | total_time = time.time() - start_time 122 | logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format( 123 | total_time, 1000 * total_time / (idx+1))) 124 | return total_loss / total_len 125 | 126 | # Run on test data.
127 | if args.split == 'all': 128 | test_loss = evaluate(te_iter) 129 | valid_loss = evaluate(va_iter) 130 | elif args.split == 'valid': 131 | valid_loss = evaluate(va_iter) 132 | test_loss = None 133 | elif args.split == 'test': 134 | test_loss = evaluate(te_iter) 135 | valid_loss = None 136 | 137 | def format_log(loss, split): 138 | log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format( 139 | split, loss, math.exp(loss)) 140 | return log_str 141 | 142 | log_str = '' 143 | if valid_loss is not None: 144 | log_str += format_log(valid_loss, 'valid') 145 | if test_loss is not None: 146 | log_str += format_log(test_loss, 'test') 147 | 148 | logger.info('=' * 100) 149 | logger.info(log_str) 150 | logger.info('=' * 100) 151 | 152 | if __name__ == '__main__': 153 | main() 154 | -------------------------------------------------------------------------------- /language_modeling/pytorch_transformers/__main__.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | def main(): 3 | import sys 4 | if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]: 5 | print( 6 | "Should be used as one of: \n" 7 | ">> pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n" 8 | ">> pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n" 9 | ">> pytorch_transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n" 10 | ">> pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n" 11 | ">> pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n" 12 | ">> pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT") 13 | else: 14 | if sys.argv[1] == "bert": 15 | try: 16 | from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch 17 | except ImportError: 18 | print("pytorch_transformers can only be used from the command line to convert TensorFlow models to PyTorch. " 19 | "In that case, it requires TensorFlow to be installed. Please see "
20 | "https://www.tensorflow.org/install/ for installation instructions.") 21 | raise 22 | 23 | if len(sys.argv) != 5: 24 | # pylint: disable=line-too-long 25 | print("Should be used as `pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") 26 | else: 27 | PYTORCH_DUMP_OUTPUT = sys.argv.pop() 28 | TF_CONFIG = sys.argv.pop() 29 | TF_CHECKPOINT = sys.argv.pop() 30 | convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) 31 | elif sys.argv[1] == "gpt": 32 | from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch 33 | if len(sys.argv) < 4 or len(sys.argv) > 5: 34 | # pylint: disable=line-too-long 35 | print("Should be used as `pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`") 36 | else: 37 | OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2] 38 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 39 | if len(sys.argv) == 5: 40 | OPENAI_GPT_CONFIG = sys.argv[4] 41 | else: 42 | OPENAI_GPT_CONFIG = "" 43 | convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH, 44 | OPENAI_GPT_CONFIG, 45 | PYTORCH_DUMP_OUTPUT) 46 | elif sys.argv[1] == "transfo_xl": 47 | try: 48 | from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch 49 | except ImportError: 50 | print("pytorch_transformers can only be used from the command line to convert TensorFlow models to PyTorch. " 51 | "In that case, it requires TensorFlow to be installed. Please see " 52 | "https://www.tensorflow.org/install/ for installation instructions.") 53 | raise 54 | if len(sys.argv) < 4 or len(sys.argv) > 5: 55 | # pylint: disable=line-too-long 56 | print("Should be used as `pytorch_transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") 57 | else: 58 | if 'ckpt' in sys.argv[2].lower(): 59 | TF_CHECKPOINT = sys.argv[2] 60 | TF_DATASET_FILE = "" 61 | else: 62 | TF_DATASET_FILE = sys.argv[2] 63 | TF_CHECKPOINT = "" 64 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 65 | if len(sys.argv) == 5: 66 | TF_CONFIG = sys.argv[4] 67 | else: 68 | TF_CONFIG = "" 69 | convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE) 70 | elif sys.argv[1] == "gpt2": 71 | try: 72 | from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch 73 | except ImportError: 74 | print("pytorch_transformers can only be used from the command line to convert TensorFlow models to PyTorch. " 75 | "In that case, it requires TensorFlow to be installed. Please see " 76 | "https://www.tensorflow.org/install/ for installation instructions.") 77 | raise 78 | 79 | if len(sys.argv) < 4 or len(sys.argv) > 5: 80 | # pylint: disable=line-too-long 81 | print("Should be used as `pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") 82 | else: 83 | TF_CHECKPOINT = sys.argv[2] 84 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 85 | if len(sys.argv) == 5: 86 | TF_CONFIG = sys.argv[4] 87 | else: 88 | TF_CONFIG = "" 89 | convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) 90 | elif sys.argv[1] == "xlnet": 91 | try: 92 | from .convert_xlnet_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch 93 | except ImportError: 94 | print("pytorch_transformers can only be used from the command line to convert TensorFlow models to PyTorch. " 95 | "In that case, it requires TensorFlow to be installed.
Please see " 96 | "https://www.tensorflow.org/install/ for installation instructions.") 97 | raise 98 | 99 | if len(sys.argv) < 5 or len(sys.argv) > 6: 100 | # pylint: disable=line-too-long 101 | print("Should be used as `pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`") 102 | else: 103 | TF_CHECKPOINT = sys.argv[2] 104 | TF_CONFIG = sys.argv[3] 105 | PYTORCH_DUMP_OUTPUT = sys.argv[4] 106 | if len(sys.argv) == 6: 107 | FINETUNING_TASK = sys.argv[5] 108 | else: 109 | FINETUNING_TASK = None 110 | 111 | convert_xlnet_checkpoint_to_pytorch(TF_CHECKPOINT, 112 | TF_CONFIG, 113 | PYTORCH_DUMP_OUTPUT, 114 | FINETUNING_TASK) 115 | elif sys.argv[1] == "xlm": 116 | from .convert_xlm_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch 117 | 118 | if len(sys.argv) != 4: 119 | # pylint: disable=line-too-long 120 | print("Should be used as `pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`") 121 | else: 122 | XLM_CHECKPOINT_PATH = sys.argv[2] 123 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 124 | 125 | convert_xlm_checkpoint_to_pytorch(XLM_CHECKPOINT_PATH, PYTORCH_DUMP_OUTPUT) 126 | 127 | if __name__ == '__main__': 128 | main() 129 | --------------------------------------------------------------------------------