├── MANIFEST.in
├── samples
│   ├── input.txt
│   └── sample_text.txt
├── docker
│   └── Dockerfile
├── requirements.txt
├── .github
│   └── stale.yml
├── .circleci
│   └── config.yml
├── pytorch_pretrained_bert
│   ├── __init__.py
│   ├── convert_tf_checkpoint_to_pytorch.py
│   ├── convert_gpt2_checkpoint_to_pytorch.py
│   ├── convert_openai_checkpoint_to_pytorch.py
│   ├── __main__.py
│   ├── convert_transfo_xl_checkpoint_to_pytorch.py
│   ├── optimization_openai.py
│   ├── optimization.py
│   ├── file_utils.py
│   ├── tokenization_gpt2.py
│   └── tokenization_openai.py
├── tests
│   ├── optimization_test.py
│   ├── tokenization_openai_test.py
│   ├── tokenization_transfo_xl_test.py
│   ├── tokenization_test.py
│   ├── modeling_gpt2_test.py
│   ├── modeling_transfo_xl_test.py
│   ├── modeling_openai_test.py
│   └── modeling_test.py
├── .gitignore
├── setup.py
├── examples
│   ├── run_gpt2.py
│   ├── lm_finetuning
│   │   ├── README.md
│   │   ├── pregenerate_training_data.py
│   │   └── finetune_on_pregenerated.py
│   ├── run_transfo_xl.py
│   ├── extract_features.py
│   └── run_openai_gpt.py
└── LICENSE
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | 
--------------------------------------------------------------------------------
/samples/input.txt:
--------------------------------------------------------------------------------
1 | Who was Jim Henson ? ||| Jim Henson was a puppeteer
2 | 
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM pytorch/pytorch:latest
2 | 
3 | RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext
4 | 
5 | RUN pip install pytorch-pretrained-bert
6 | 
7 | WORKDIR /workspace
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # PyTorch
2 | torch>=0.4.1
3 | # progress bars in model download and training scripts
4 | tqdm
5 | # Accessing files from S3 directly.
6 | boto3
7 | # Used for downloading models over HTTP
8 | requests
9 | # For OpenAI GPT
10 | regex
--------------------------------------------------------------------------------
/.github/stale.yml:
--------------------------------------------------------------------------------
1 | # Number of days of inactivity before an issue becomes stale
2 | daysUntilStale: 60
3 | # Number of days of inactivity before a stale issue is closed
4 | daysUntilClose: 7
5 | # Issues with these labels will never be considered stale
6 | exemptLabels:
7 |   - pinned
8 |   - security
9 | # Label to use when marking an issue as stale
10 | staleLabel: wontfix
11 | # Comment to post when marking an issue as stale. Set to `false` to disable
12 | markComment: >
13 |   This issue has been automatically marked as stale because it has not had
14 |   recent activity. It will be closed if no further activity occurs. Thank you
15 |   for your contributions.
16 | # Comment to post when closing a stale issue. Set to `false` to disable
17 | closeComment: false
--------------------------------------------------------------------------------
/.circleci/config.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | jobs:
3 |     build_py3:
4 |         working_directory: ~/pytorch-pretrained-BERT
5 |         docker:
6 |             - image: circleci/python:3.5
7 |         steps:
8 |             - checkout
9 |             - run: sudo pip install --progress-bar off .
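            # pytest drives the test suite below; ftfy and spacy are optional
            # dependencies of the OpenAI GPT tokenizer that the tests exercise when present.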
10 | - run: sudo pip install pytest ftfy spacy 11 | - run: sudo python -m spacy download en 12 | - run: python -m pytest -sv tests/ 13 | build_py2: 14 | working_directory: ~/pytorch-pretrained-BERT 15 | docker: 16 | - image: circleci/python:2.7 17 | steps: 18 | - checkout 19 | - run: sudo pip install --progress-bar off . 20 | - run: sudo pip install pytest spacy 21 | - run: sudo pip install ftfy==4.4.3 22 | - run: sudo python -m spacy download en 23 | - run: python -m pytest -sv tests/ 24 | workflows: 25 | version: 2 26 | build_and_test: 27 | jobs: 28 | - build_py3 29 | - build_py2 -------------------------------------------------------------------------------- /pytorch_pretrained_bert/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.6.1" 2 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer 3 | from .tokenization_openai import OpenAIGPTTokenizer 4 | from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) 5 | from .tokenization_gpt2 import GPT2Tokenizer 6 | 7 | from .modeling import (BertConfig, BertModel, BertForPreTraining, 8 | BertForMaskedLM, BertForNextSentencePrediction, 9 | BertForSequenceClassification, BertForMultipleChoice, 10 | BertForTokenClassification, BertForQuestionAnswering, 11 | load_tf_weights_in_bert) 12 | from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel, 13 | OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, 14 | load_tf_weights_in_openai_gpt) 15 | from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel, 16 | load_tf_weights_in_transfo_xl) 17 | from .modeling_gpt2 import (GPT2Config, GPT2Model, 18 | GPT2LMHeadModel, GPT2DoubleHeadsModel, 19 | load_tf_weights_in_gpt2) 20 | 21 | from .optimization import BertAdam 22 | from .optimization_openai import OpenAIAdam 23 | 24 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path 25 | -------------------------------------------------------------------------------- /tests/optimization_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
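# BertAdam is the BERT-flavoured Adam optimizer exported by pytorch_pretrained_bert
# (learning-rate warmup schedules, decoupled weight decay, optional gradient clipping).
# Illustrative usage sketch (hypothetical `model` and `num_train_steps`; not executed by this test):
#
#     optimizer = BertAdam(model.parameters(), lr=5e-5,
#                          warmup=0.1, t_total=num_train_steps)
#     loss.backward()
#     optimizer.step()
#     optimizer.zero_grad()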
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | 21 | import torch 22 | 23 | from pytorch_pretrained_bert import BertAdam 24 | 25 | class OptimizationTest(unittest.TestCase): 26 | 27 | def assertListAlmostEqual(self, list1, list2, tol): 28 | self.assertEqual(len(list1), len(list2)) 29 | for a, b in zip(list1, list2): 30 | self.assertAlmostEqual(a, b, delta=tol) 31 | 32 | def test_adam(self): 33 | w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True) 34 | target = torch.tensor([0.4, 0.2, -0.5]) 35 | criterion = torch.nn.MSELoss() 36 | # No warmup, constant schedule, no gradient clipping 37 | optimizer = BertAdam(params=[w], lr=2e-1, 38 | weight_decay=0.0, 39 | max_grad_norm=-1) 40 | for _ in range(100): 41 | loss = criterion(w, target) 42 | loss.backward() 43 | optimizer.step() 44 | w.grad.detach_() # No zero_grad() function on simple tensors. we do it ourselves. 45 | w.grad.zero_() 46 | self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2) 47 | 48 | 49 | if __name__ == "__main__": 50 | unittest.main() 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Initially taken from Github's Python gitignore file 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | .dmypy.json 113 | dmypy.json 114 | 115 | # Pyre type checker 116 | .pyre/ 117 | 118 | # vscode 119 | .vscode 120 | 121 | # TF code 122 | tensorflow_code 123 | 124 | # Models 125 | models -------------------------------------------------------------------------------- /tests/tokenization_openai_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | import json 20 | 21 | from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer 22 | 23 | 24 | class OpenAIGPTTokenizationTest(unittest.TestCase): 25 | 26 | def test_full_tokenizer(self): 27 | """ Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt """ 28 | vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", 29 | "w", "r", "t", 30 | "lo", "low", "er", 31 | "low", "lowest", "newer", "wider"] 32 | vocab_tokens = dict(zip(vocab, range(len(vocab)))) 33 | merges = ["#version: 0.2", "l o", "lo w", "e r", ""] 34 | with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp: 35 | json.dump(vocab_tokens, fp) 36 | vocab_file = fp.name 37 | with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp: 38 | fp.write("\n".join(merges)) 39 | merges_file = fp.name 40 | 41 | tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file, special_tokens=[""]) 42 | os.remove(vocab_file) 43 | os.remove(merges_file) 44 | 45 | text = "lower" 46 | bpe_tokens = ["low", "er"] 47 | tokens = tokenizer.tokenize(text) 48 | self.assertListEqual(tokens, bpe_tokens) 49 | 50 | input_tokens = tokens + [""] 51 | input_bpe_tokens = [14, 15, 20] 52 | self.assertListEqual( 53 | tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) 54 | 55 | if __name__ == '__main__': 56 | unittest.main() 57 | -------------------------------------------------------------------------------- /pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import re 23 | import argparse 24 | import tensorflow as tf 25 | import torch 26 | import numpy as np 27 | 28 | from pytorch_pretrained_bert.modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert 29 | 30 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 31 | # Initialise PyTorch model 32 | config = BertConfig.from_json_file(bert_config_file) 33 | print("Building PyTorch model from configuration: {}".format(str(config))) 34 | model = BertForPreTraining(config) 35 | 36 | # Load weights from tf checkpoint 37 | load_tf_weights_in_bert(model, tf_checkpoint_path) 38 | 39 | # Save pytorch-model 40 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 41 | torch.save(model.state_dict(), pytorch_dump_path) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | ## Required parameters 47 | parser.add_argument("--tf_checkpoint_path", 48 | default = None, 49 | type = str, 50 | required = True, 51 | help = "Path the TensorFlow checkpoint path.") 52 | parser.add_argument("--bert_config_file", 53 | default = None, 54 | type = str, 55 | required = True, 56 | help = "The config json file corresponding to the pre-trained BERT model. 
\n" 57 | "This specifies the model architecture.") 58 | parser.add_argument("--pytorch_dump_path", 59 | default = None, 60 | type = str, 61 | required = True, 62 | help = "Path to the output PyTorch model.") 63 | args = parser.parse_args() 64 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, 65 | args.bert_config_file, 66 | args.pytorch_dump_path) 67 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple check list from AllenNLP repo: https://github.com/allenai/allennlp/blob/master/setup.py 3 | 4 | To create the package for pypi. 5 | 6 | 1. Change the version in __init__.py and setup.py. 7 | 8 | 2. Commit these changes with the message: "Release: VERSION" 9 | 10 | 3. Add a tag in git to mark the release: "git tag VERSION -m'Adds tag VERSION for pypi' " 11 | Push the tag to git: git push --tags origin master 12 | 13 | 4. Build both the sources and the wheel. Do not change anything in setup.py between 14 | creating the wheel and the source distribution (obviously). 15 | 16 | For the wheel, run: "python setup.py bdist_wheel" in the top level allennlp directory. 17 | (this will build a wheel for the python version you use to build it - make sure you use python 3.x). 18 | 19 | For the sources, run: "python setup.py sdist" 20 | You should now have a /dist directory with both .whl and .tar.gz source versions of allennlp. 21 | 22 | 5. Check that everything looks correct by uploading the package to the pypi test server: 23 | 24 | twine upload dist/* -r pypitest 25 | (pypi suggest using twine as other methods upload files via plaintext.) 26 | 27 | Check that you can install it in a virtualenv by running: 28 | pip install -i https://testpypi.python.org/pypi allennlp 29 | 30 | 6. Upload the final version to actual pypi: 31 | twine upload dist/* -r pypi 32 | 33 | 7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory. 
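
   For reference, the build-and-upload commands from steps 4-6 boil down to the
   following sequence (illustrative; substitute the real version/tag where needed):

      python setup.py bdist_wheel
      python setup.py sdist
      twine upload dist/* -r pypitest
      twine upload dist/* -r pypi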
34 | 35 | """ 36 | from io import open 37 | from setuptools import find_packages, setup 38 | 39 | setup( 40 | name="pytorch_pretrained_bert", 41 | version="0.6.1", 42 | author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors", 43 | author_email="thomas@huggingface.co", 44 | description="PyTorch version of Google AI BERT model with script to load Google pre-trained models", 45 | long_description=open("README.md", "r", encoding='utf-8').read(), 46 | long_description_content_type="text/markdown", 47 | keywords='BERT NLP deep learning google', 48 | license='Apache', 49 | url="https://github.com/huggingface/pytorch-pretrained-BERT", 50 | packages=find_packages(exclude=["*.tests", "*.tests.*", 51 | "tests.*", "tests"]), 52 | install_requires=['torch>=0.4.1', 53 | 'numpy', 54 | 'boto3', 55 | 'requests', 56 | 'tqdm', 57 | 'regex'], 58 | entry_points={ 59 | 'console_scripts': [ 60 | "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main", 61 | ] 62 | }, 63 | # python_requires='>=3.5.0', 64 | tests_require=['pytest'], 65 | classifiers=[ 66 | 'Intended Audience :: Science/Research', 67 | 'License :: OSI Approved :: Apache Software License', 68 | 'Programming Language :: Python :: 3', 69 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 70 | ], 71 | ) 72 | -------------------------------------------------------------------------------- /pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
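# Example invocation (paths illustrative; the flags are defined in the argparse section below):
#
#   python convert_gpt2_checkpoint_to_pytorch.py \
#       --gpt2_checkpoint_path /path/to/gpt2/tf/checkpoint \
#       --pytorch_dump_folder_path /path/to/output/dir \
#       --gpt2_config_file /path/to/gpt2/config.json
#
# The same conversion is also exposed as `pytorch_pretrained_bert convert_gpt2_checkpoint ...`
# via the package's command-line entry point (see __main__.py).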
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from pytorch_pretrained_bert.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME, 25 | GPT2Config, 26 | GPT2Model, 27 | load_tf_weights_in_gpt2) 28 | 29 | 30 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 31 | # Construct model 32 | if gpt2_config_file == "": 33 | config = GPT2Config() 34 | else: 35 | config = GPT2Config(gpt2_config_file) 36 | model = GPT2Model(config) 37 | 38 | # Load weights from numpy 39 | load_tf_weights_in_gpt2(model, gpt2_checkpoint_path) 40 | 41 | # Save pytorch-model 42 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 43 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 44 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 45 | torch.save(model.state_dict(), pytorch_weights_dump_path) 46 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 47 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 48 | f.write(config.to_json_string()) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | ## Required parameters 54 | parser.add_argument("--gpt2_checkpoint_path", 55 | default = None, 56 | type = str, 57 | required = True, 58 | help = "Path the TensorFlow checkpoint path.") 59 | parser.add_argument("--pytorch_dump_folder_path", 60 | default = None, 61 | type = str, 62 | required = True, 63 | help = "Path to the output PyTorch model.") 64 | parser.add_argument("--gpt2_config_file", 65 | default = "", 66 | type = str, 67 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 68 | "This specifies the model architecture.") 69 | args = parser.parse_args() 70 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, 71 | args.gpt2_config_file, 72 | args.pytorch_dump_folder_path) 73 | -------------------------------------------------------------------------------- /pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
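# Example invocation (paths illustrative; flags match the argparse section below):
#
#   python convert_openai_checkpoint_to_pytorch.py \
#       --openai_checkpoint_folder_path /path/to/openai-gpt/checkpoint/ \
#       --pytorch_dump_folder_path /path/to/output/dir
#
# --openai_config_file is optional; when omitted, the default OpenAIGPTConfig is used.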
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from pytorch_pretrained_bert.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME, 25 | OpenAIGPTConfig, 26 | OpenAIGPTModel, 27 | load_tf_weights_in_openai_gpt) 28 | 29 | 30 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 31 | # Construct model 32 | if openai_config_file == "": 33 | config = OpenAIGPTConfig() 34 | else: 35 | config = OpenAIGPTConfig(openai_config_file) 36 | model = OpenAIGPTModel(config) 37 | 38 | # Load weights from numpy 39 | load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path) 40 | 41 | # Save pytorch-model 42 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 43 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 44 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 45 | torch.save(model.state_dict(), pytorch_weights_dump_path) 46 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 47 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 48 | f.write(config.to_json_string()) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | ## Required parameters 54 | parser.add_argument("--openai_checkpoint_folder_path", 55 | default = None, 56 | type = str, 57 | required = True, 58 | help = "Path the TensorFlow checkpoint path.") 59 | parser.add_argument("--pytorch_dump_folder_path", 60 | default = None, 61 | type = str, 62 | required = True, 63 | help = "Path to the output PyTorch model.") 64 | parser.add_argument("--openai_config_file", 65 | default = "", 66 | type = str, 67 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 68 | "This specifies the model architecture.") 69 | args = parser.parse_args() 70 | convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path, 71 | args.openai_config_file, 72 | args.pytorch_dump_folder_path) 73 | -------------------------------------------------------------------------------- /tests/tokenization_transfo_xl_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
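# Unit tests for the Transformer-XL word-level tokenizer: building a vocabulary from a
# file, lower-cased vs. cased tokenization, and the _is_whitespace / _is_control /
# _is_punctuation character helpers.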
15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | from io import open 20 | 21 | from pytorch_pretrained_bert.tokenization_transfo_xl import (TransfoXLTokenizer, 22 | _is_control, _is_punctuation, 23 | _is_whitespace) 24 | 25 | 26 | class TransfoXLTokenizationTest(unittest.TestCase): 27 | 28 | def test_full_tokenizer(self): 29 | vocab_tokens = [ 30 | "", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un", "running", "," 31 | ] 32 | with open("/tmp/transfo_xl_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer: 33 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) 34 | vocab_file = vocab_writer.name 35 | 36 | tokenizer = TransfoXLTokenizer(vocab_file=vocab_file, lower_case=True) 37 | tokenizer.build_vocab() 38 | os.remove(vocab_file) 39 | 40 | tokens = tokenizer.tokenize(u" UNwant\u00E9d,running") 41 | self.assertListEqual(tokens, ["", "unwanted", ",", "running"]) 42 | 43 | self.assertListEqual( 44 | tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7]) 45 | 46 | def test_full_tokenizer_lower(self): 47 | tokenizer = TransfoXLTokenizer(lower_case=True) 48 | 49 | self.assertListEqual( 50 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), 51 | ["hello", "!", "how", "are", "you", "?"]) 52 | self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) 53 | 54 | def test_full_tokenizer_no_lower(self): 55 | tokenizer = TransfoXLTokenizer(lower_case=False) 56 | 57 | self.assertListEqual( 58 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), 59 | ["HeLLo", "!", "how", "Are", "yoU", "?"]) 60 | 61 | def test_is_whitespace(self): 62 | self.assertTrue(_is_whitespace(u" ")) 63 | self.assertTrue(_is_whitespace(u"\t")) 64 | self.assertTrue(_is_whitespace(u"\r")) 65 | self.assertTrue(_is_whitespace(u"\n")) 66 | self.assertTrue(_is_whitespace(u"\u00A0")) 67 | 68 | self.assertFalse(_is_whitespace(u"A")) 69 | self.assertFalse(_is_whitespace(u"-")) 70 | 71 | def test_is_control(self): 72 | self.assertTrue(_is_control(u"\u0005")) 73 | 74 | self.assertFalse(_is_control(u"A")) 75 | self.assertFalse(_is_control(u" ")) 76 | self.assertFalse(_is_control(u"\t")) 77 | self.assertFalse(_is_control(u"\r")) 78 | 79 | def test_is_punctuation(self): 80 | self.assertTrue(_is_punctuation(u"-")) 81 | self.assertTrue(_is_punctuation(u"$")) 82 | self.assertTrue(_is_punctuation(u"`")) 83 | self.assertTrue(_is_punctuation(u".")) 84 | 85 | self.assertFalse(_is_punctuation(u"A")) 86 | self.assertFalse(_is_punctuation(u" ")) 87 | 88 | 89 | if __name__ == '__main__': 90 | unittest.main() 91 | -------------------------------------------------------------------------------- /samples/sample_text.txt: -------------------------------------------------------------------------------- 1 | This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত 2 | Text should be one-sentence-per-line, with empty lines between documents. 3 | This sample text is public domain and was randomly selected from Project Guttenberg. 4 | 5 | The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors. 
6 | Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity. 7 | Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them. 8 | "Cass" Beard had risen early that morning, but not with a view to discovery. 9 | A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets. 10 | The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency. 11 | This was nearly opposite. 12 | Mr. Cassius crossed the highway, and stopped suddenly. 13 | Something glittered in the nearest red pool before him. 14 | Gold, surely! 15 | But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring. 16 | Looking at it more attentively, he saw that it bore the inscription, "May to Cass." 17 | Like most of his fellow gold-seekers, Cass was superstitious. 18 | 19 | The fountain of classic wisdom, Hypatia herself. 20 | As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge. 21 | From my youth I felt in me a soul above the matter-entangled herd. 22 | She revealed to me the glorious fact, that I am a spark of Divinity itself. 23 | A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's. 24 | There is a philosophic pleasure in opening one's treasures to the modest young. 25 | Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street. 26 | Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide; 27 | but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind. 28 | Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now. 
29 | His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert; 30 | while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts. 31 | At last they reached the quay at the opposite end of the street; 32 | and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers. 33 | He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him. 34 | -------------------------------------------------------------------------------- /pytorch_pretrained_bert/__main__.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | def main(): 3 | import sys 4 | if (len(sys.argv) != 4 and len(sys.argv) != 5) or sys.argv[1] not in [ 5 | "convert_tf_checkpoint_to_pytorch", 6 | "convert_openai_checkpoint", 7 | "convert_transfo_xl_checkpoint", 8 | "convert_gpt2_checkpoint", 9 | ]: 10 | print( 11 | "Should be used as one of: \n" 12 | ">> `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n" 13 | ">> `pytorch_pretrained_bert convert_openai_checkpoint OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`, \n" 14 | ">> `pytorch_pretrained_bert convert_transfo_xl_checkpoint TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG]` or \n" 15 | ">> `pytorch_pretrained_bert convert_gpt2_checkpoint TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG]`") 16 | else: 17 | if sys.argv[1] == "convert_tf_checkpoint_to_pytorch": 18 | try: 19 | from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch 20 | except ImportError: 21 | print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, " 22 | "In that case, it requires TensorFlow to be installed. Please see " 23 | "https://www.tensorflow.org/install/ for installation instructions.") 24 | raise 25 | 26 | if len(sys.argv) != 5: 27 | # pylint: disable=line-too-long 28 | print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") 29 | else: 30 | PYTORCH_DUMP_OUTPUT = sys.argv.pop() 31 | TF_CONFIG = sys.argv.pop() 32 | TF_CHECKPOINT = sys.argv.pop() 33 | convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) 34 | elif sys.argv[1] == "convert_openai_checkpoint": 35 | from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch 36 | OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2] 37 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 38 | if len(sys.argv) == 5: 39 | OPENAI_GPT_CONFIG = sys.argv[4] 40 | else: 41 | OPENAI_GPT_CONFIG = "" 42 | convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH, 43 | OPENAI_GPT_CONFIG, 44 | PYTORCH_DUMP_OUTPUT) 45 | elif sys.argv[1] == "convert_transfo_xl_checkpoint": 46 | try: 47 | from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch 48 | except ImportError: 49 | print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, " 50 | "In that case, it requires TensorFlow to be installed. 
Please see " 51 | "https://www.tensorflow.org/install/ for installation instructions.") 52 | raise 53 | 54 | if 'ckpt' in sys.argv[2].lower(): 55 | TF_CHECKPOINT = sys.argv[2] 56 | TF_DATASET_FILE = "" 57 | else: 58 | TF_DATASET_FILE = sys.argv[2] 59 | TF_CHECKPOINT = "" 60 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 61 | if len(sys.argv) == 5: 62 | TF_CONFIG = sys.argv[4] 63 | else: 64 | TF_CONFIG = "" 65 | convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE) 66 | else: 67 | try: 68 | from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch 69 | except ImportError: 70 | print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, " 71 | "In that case, it requires TensorFlow to be installed. Please see " 72 | "https://www.tensorflow.org/install/ for installation instructions.") 73 | raise 74 | 75 | TF_CHECKPOINT = sys.argv[2] 76 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 77 | if len(sys.argv) == 5: 78 | TF_CONFIG = sys.argv[4] 79 | else: 80 | TF_CONFIG = "" 81 | convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) 82 | if __name__ == '__main__': 83 | main() 84 | -------------------------------------------------------------------------------- /tests/tokenization_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | from io import open 20 | 21 | from pytorch_pretrained_bert.tokenization import (BasicTokenizer, 22 | BertTokenizer, 23 | WordpieceTokenizer, 24 | _is_control, _is_punctuation, 25 | _is_whitespace) 26 | 27 | 28 | class TokenizationTest(unittest.TestCase): 29 | 30 | def test_full_tokenizer(self): 31 | vocab_tokens = [ 32 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 33 | "##ing", "," 34 | ] 35 | with open("/tmp/bert_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer: 36 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) 37 | 38 | vocab_file = vocab_writer.name 39 | 40 | tokenizer = BertTokenizer(vocab_file) 41 | os.remove(vocab_file) 42 | 43 | tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") 44 | self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) 45 | 46 | self.assertListEqual( 47 | tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) 48 | 49 | def test_chinese(self): 50 | tokenizer = BasicTokenizer() 51 | 52 | self.assertListEqual( 53 | tokenizer.tokenize(u"ah\u535A\u63A8zz"), 54 | [u"ah", u"\u535A", u"\u63A8", u"zz"]) 55 | 56 | def test_basic_tokenizer_lower(self): 57 | tokenizer = BasicTokenizer(do_lower_case=True) 58 | 59 | self.assertListEqual( 60 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? 
"), 61 | ["hello", "!", "how", "are", "you", "?"]) 62 | self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) 63 | 64 | def test_basic_tokenizer_no_lower(self): 65 | tokenizer = BasicTokenizer(do_lower_case=False) 66 | 67 | self.assertListEqual( 68 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), 69 | ["HeLLo", "!", "how", "Are", "yoU", "?"]) 70 | 71 | def test_wordpiece_tokenizer(self): 72 | vocab_tokens = [ 73 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 74 | "##ing" 75 | ] 76 | 77 | vocab = {} 78 | for (i, token) in enumerate(vocab_tokens): 79 | vocab[token] = i 80 | tokenizer = WordpieceTokenizer(vocab=vocab) 81 | 82 | self.assertListEqual(tokenizer.tokenize(""), []) 83 | 84 | self.assertListEqual( 85 | tokenizer.tokenize("unwanted running"), 86 | ["un", "##want", "##ed", "runn", "##ing"]) 87 | 88 | self.assertListEqual( 89 | tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) 90 | 91 | def test_is_whitespace(self): 92 | self.assertTrue(_is_whitespace(u" ")) 93 | self.assertTrue(_is_whitespace(u"\t")) 94 | self.assertTrue(_is_whitespace(u"\r")) 95 | self.assertTrue(_is_whitespace(u"\n")) 96 | self.assertTrue(_is_whitespace(u"\u00A0")) 97 | 98 | self.assertFalse(_is_whitespace(u"A")) 99 | self.assertFalse(_is_whitespace(u"-")) 100 | 101 | def test_is_control(self): 102 | self.assertTrue(_is_control(u"\u0005")) 103 | 104 | self.assertFalse(_is_control(u"A")) 105 | self.assertFalse(_is_control(u" ")) 106 | self.assertFalse(_is_control(u"\t")) 107 | self.assertFalse(_is_control(u"\r")) 108 | 109 | def test_is_punctuation(self): 110 | self.assertTrue(_is_punctuation(u"-")) 111 | self.assertTrue(_is_punctuation(u"$")) 112 | self.assertTrue(_is_punctuation(u"`")) 113 | self.assertTrue(_is_punctuation(u".")) 114 | 115 | self.assertFalse(_is_punctuation(u"A")) 116 | self.assertFalse(_is_punctuation(u" ")) 117 | 118 | 119 | if __name__ == '__main__': 120 | unittest.main() 121 | -------------------------------------------------------------------------------- /examples/run_gpt2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import logging 5 | from tqdm import trange 6 | 7 | import torch 8 | import torch.nn.functional as F 9 | import numpy as np 10 | 11 | from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer 12 | 13 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 14 | datefmt = '%m/%d/%Y %H:%M:%S', 15 | level = logging.INFO) 16 | logger = logging.getLogger(__name__) 17 | 18 | def top_k_logits(logits, k): 19 | """ 20 | Masks everything but the k top entries as -infinity (1e10). 21 | Used to mask logits such that e^-infinity -> 0 won't contribute to the 22 | sum of the denominator. 23 | """ 24 | if k == 0: 25 | return logits 26 | else: 27 | values = torch.topk(logits, k)[0] 28 | batch_mins = values[:, -1].view(-1, 1).expand_as(logits) 29 | return torch.where(logits < batch_mins, torch.ones_like(logits) * -1e10, logits) 30 | 31 | def sample_sequence(model, length, start_token=None, batch_size=None, context=None, temperature=1, top_k=0, device='cuda', sample=True): 32 | if start_token is None: 33 | assert context is not None, 'Specify exactly one of start_token and context!' 34 | context = torch.tensor(context, device=device, dtype=torch.long).unsqueeze(0).repeat(batch_size, 1) 35 | else: 36 | assert context is None, 'Specify exactly one of start_token and context!' 
37 | context = torch.full((batch_size, 1), start_token, device=device, dtype=torch.long) 38 | prev = context 39 | output = context 40 | past = None 41 | with torch.no_grad(): 42 | for i in trange(length): 43 | logits, past = model(prev, past=past) 44 | logits = logits[:, -1, :] / temperature 45 | logits = top_k_logits(logits, k=top_k) 46 | log_probs = F.softmax(logits, dim=-1) 47 | if sample: 48 | prev = torch.multinomial(log_probs, num_samples=1) 49 | else: 50 | _, prev = torch.topk(log_probs, k=1, dim=-1) 51 | output = torch.cat((output, prev), dim=1) 52 | return output 53 | 54 | def run_model(): 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument('--model_name_or_path', type=str, default='gpt2', help='pretrained model name or path to local checkpoint') 57 | parser.add_argument("--seed", type=int, default=0) 58 | parser.add_argument("--nsamples", type=int, default=1) 59 | parser.add_argument("--batch_size", type=int, default=-1) 60 | parser.add_argument("--length", type=int, default=-1) 61 | parser.add_argument("--temperature", type=int, default=1) 62 | parser.add_argument("--top_k", type=int, default=0) 63 | parser.add_argument('--unconditional', action='store_true', help='If true, unconditional generation.') 64 | args = parser.parse_args() 65 | print(args) 66 | 67 | if args.batch_size == -1: 68 | args.batch_size = 1 69 | assert args.nsamples % args.batch_size == 0 70 | 71 | np.random.seed(args.seed) 72 | torch.random.manual_seed(args.seed) 73 | torch.cuda.manual_seed(args.seed) 74 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 75 | 76 | enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path) 77 | model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path) 78 | model.to(device) 79 | model.eval() 80 | 81 | if args.length == -1: 82 | args.length = model.config.n_ctx // 2 83 | elif args.length > model.config.n_ctx: 84 | raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx) 85 | 86 | while not args.unconditional: 87 | if not args.unconditional: 88 | raw_text = input("Model prompt >>> ") 89 | while not raw_text: 90 | print('Prompt should not be empty!') 91 | raw_text = input("Model prompt >>> ") 92 | context_tokens = enc.encode(raw_text) 93 | generated = 0 94 | for _ in range(args.nsamples // args.batch_size): 95 | out = sample_sequence( 96 | model=model, length=args.length, 97 | context=context_tokens if not args.unconditional else None, 98 | start_token=enc.encoder['<|endoftext|>'] if args.unconditional else None, 99 | batch_size=args.batch_size, 100 | temperature=args.temperature, top_k=args.top_k, device=device 101 | ) 102 | out = out[:, len(context_tokens):].tolist() 103 | for i in range(args.batch_size): 104 | generated += 1 105 | text = enc.decode(out[i]) 106 | print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) 107 | print(text) 108 | print("=" * 80) 109 | 110 | if __name__ == '__main__': 111 | run_model() 112 | 113 | -------------------------------------------------------------------------------- /pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert Transformer XL checkpoint and datasets.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import os 21 | import sys 22 | from io import open 23 | 24 | import torch 25 | 26 | import pytorch_pretrained_bert.tokenization_transfo_xl as data_utils 27 | from pytorch_pretrained_bert.modeling_transfo_xl import (CONFIG_NAME, 28 | WEIGHTS_NAME, 29 | TransfoXLConfig, 30 | TransfoXLLMHeadModel, 31 | load_tf_weights_in_transfo_xl) 32 | from pytorch_pretrained_bert.tokenization_transfo_xl import (CORPUS_NAME, 33 | VOCAB_NAME) 34 | 35 | if sys.version_info[0] == 2: 36 | import cPickle as pickle 37 | else: 38 | import pickle 39 | 40 | # We do this to be able to load python 2 datasets pickles 41 | # See e.g. https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918 42 | data_utils.Vocab = data_utils.TransfoXLTokenizer 43 | data_utils.Corpus = data_utils.TransfoXLCorpus 44 | sys.modules['data_utils'] = data_utils 45 | sys.modules['vocabulary'] = data_utils 46 | 47 | def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path, 48 | transfo_xl_config_file, 49 | pytorch_dump_folder_path, 50 | transfo_xl_dataset_file): 51 | if transfo_xl_dataset_file: 52 | # Convert a pre-processed corpus (see original TensorFlow repo) 53 | with open(transfo_xl_dataset_file, "rb") as fp: 54 | corpus = pickle.load(fp, encoding="latin1") 55 | # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term) 56 | pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_NAME 57 | print("Save vocabulary to {}".format(pytorch_vocab_dump_path)) 58 | corpus_vocab_dict = corpus.vocab.__dict__ 59 | torch.save(corpus_vocab_dict, pytorch_vocab_dump_path) 60 | 61 | corpus_dict_no_vocab = corpus.__dict__ 62 | corpus_dict_no_vocab.pop('vocab', None) 63 | pytorch_dataset_dump_path = pytorch_dump_folder_path + '/' + CORPUS_NAME 64 | print("Save dataset to {}".format(pytorch_dataset_dump_path)) 65 | torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path) 66 | 67 | if tf_checkpoint_path: 68 | # Convert a pre-trained TensorFlow model 69 | config_path = os.path.abspath(transfo_xl_config_file) 70 | tf_path = os.path.abspath(tf_checkpoint_path) 71 | 72 | print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path)) 73 | # Initialise PyTorch model 74 | if transfo_xl_config_file == "": 75 | config = TransfoXLConfig() 76 | else: 77 | config = TransfoXLConfig(transfo_xl_config_file) 78 | print("Building PyTorch model from configuration: {}".format(str(config))) 79 | model = TransfoXLLMHeadModel(config) 80 | 81 | model = load_tf_weights_in_transfo_xl(model, config, tf_path) 82 | # Save pytorch-model 83 | pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) 84 | pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) 85 | print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) 86 | torch.save(model.state_dict(), 
pytorch_weights_dump_path) 87 | print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) 88 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 89 | f.write(config.to_json_string()) 90 | 91 | 92 | if __name__ == "__main__": 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument("--pytorch_dump_folder_path", 95 | default = None, 96 | type = str, 97 | required = True, 98 | help = "Path to the folder to store the PyTorch model or dataset/vocab.") 99 | parser.add_argument("--tf_checkpoint_path", 100 | default = "", 101 | type = str, 102 | help = "An optional path to a TensorFlow checkpoint path to be converted.") 103 | parser.add_argument("--transfo_xl_config_file", 104 | default = "", 105 | type = str, 106 | help = "An optional config json file corresponding to the pre-trained BERT model. \n" 107 | "This specifies the model architecture.") 108 | parser.add_argument("--transfo_xl_dataset_file", 109 | default = "", 110 | type = str, 111 | help = "An optional dataset file to be converted in a vocabulary.") 112 | args = parser.parse_args() 113 | convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path, 114 | args.transfo_xl_config_file, 115 | args.pytorch_dump_folder_path, 116 | args.transfo_xl_dataset_file) 117 | -------------------------------------------------------------------------------- /examples/lm_finetuning/README.md: -------------------------------------------------------------------------------- 1 | # BERT Model Finetuning using Masked Language Modeling objective 2 | 3 | ## Introduction 4 | 5 | The three example scripts in this folder can be used to **fine-tune** a pre-trained BERT model using the pretraining objective (combination of masked language modeling and next sentence prediction loss). In general, pretrained models like BERT are first trained with a pretraining objective (masked language modeling and next sentence prediction for BERT) on a large and general natural language corpus. A classifier head is then added on top of the pre-trained architecture and the model is quickly fine-tuned on a target task, while still (hopefully) retaining its general language understanding. This greatly reduces overfitting and yields state-of-the-art results, especially when training data for the target task are limited. 6 | 7 | The [ULMFiT paper](https://arxiv.org/abs/1801.06146) took a slightly different approach, however, and added an intermediate step in which the model is fine-tuned on text **from the same domain as the target task and using the pretraining objective** before the final stage in which the classifier head is added and the model is trained on the target task itself. This paper reported significantly improved results from this step, and found that they could get high-quality classifications even with only tiny numbers (<1000) of labelled training examples, as long as they had a lot of unlabelled data from the target domain. 8 | 9 | The BERT model has more capacity than the LSTM models used in the ULMFiT work, but the [BERT paper](https://arxiv.org/abs/1810.04805) did not test finetuning using the pretraining objective and at the present stage there aren't many examples of this approach being used for Transformer-based language models. 
As such, it's hard to predict what effect this step will have on final model performance, but it's reasonable to conjecture that this approach can improve the final classification performance, especially when a large unlabelled corpus from the target domain is available, labelled data is limited, or the target domain is very unusual and different from 'normal' English text. If you are aware of any literature on this subject, please feel free to add it in here, or open an issue and tag me (@Rocketknight1) and I'll include it. 10 | 11 | ## Input format 12 | 13 | The scripts in this folder expect a single file as input, consisting of untokenized text, with one **sentence** per line, and one blank line between documents. The reason for the sentence splitting is that part of BERT's training involves a _next sentence_ objective in which the model must predict whether two sequences of text are contiguous text from the same document or not, and to avoid making the task _too easy_, the split point between the sequences is always at the end of a sentence. The linebreaks in the file are therefore necessary to mark the points where the text can be split. 14 | 15 | ## Usage 16 | 17 | There are two ways to fine-tune a language model using these scripts. The first _quick_ approach is to use [`simple_lm_finetuning.py`](./simple_lm_finetuning.py). This script does everything in a single script, but generates training instances that consist of just two sentences. This is quite different from the BERT paper, where (confusingly) the NextSentence task concatenated sentences together from each document to form two long multi-sentences, which the paper just referred to as _sentences_. The difference between this simple approach and the original paper approach can have a significant effect for long sequences since two sentences will be much shorter than the max sequence length. In this case, most of each training example will just consist of blank padding characters, which wastes a lot of computation and results in a model that isn't really training on long sequences. 18 | 19 | As such, the preferred approach (assuming you have documents containing multiple contiguous sentences from your target domain) is to use [`pregenerate_training_data.py`](./pregenerate_training_data.py) to pre-process your data into training examples following the methodology used for LM training in the original BERT paper and repository. Since there is a significant random component to training data generation for BERT, this script includes an option to generate multiple _epochs_ of pre-processed data, to avoid training on the same random splits each epoch. Generating an epoch of data for each training epoch should result a better final model, and so we recommend doing so. 20 | 21 | You can then train on the pregenerated data using [`finetune_on_pregenerated.py`](./finetune_on_pregenerated.py), and pointing it to the folder created by [`pregenerate_training_data.py`](./pregenerate_training_data.py). Note that you should use the same `bert_model` and case options for both! Also note that `max_seq_len` does not need to be specified for the [`finetune_on_pregenerated.py`](./finetune_on_pregenerated.py) script, as it is inferred from the training examples. 22 | 23 | There are various options that can be tweaked, but they are mostly set to the values from the BERT paper/repository and default values should make sense. 
The most relevant ones are: 24 | 25 | - `--max_seq_len`: Controls the length of training examples (in wordpiece tokens) seen by the model. Defaults to 128 but can be set as high as 512. Higher values may yield stronger language models at the cost of slower and more memory-intensive training. 26 | - `--fp16`: Enables fast half-precision training on recent GPUs. 27 | 28 | In addition, if memory usage is an issue, especially when training on a single GPU, reducing `--train_batch_size` from the default 32 to a lower number (4-16) can be helpful, or leaving `--train_batch_size` at the default and increasing `--gradient_accumulation_steps` to 2-8. Changing `--gradient_accumulation_steps` may be preferable as alterations to the batch size may require corresponding changes in the learning rate to compensate. There is also a `--reduce_memory` option for both the `pregenerate_training_data.py` and `finetune_on_pregenerated.py` scripts that spills data to disc in shelf objects or numpy memmaps rather than retaining it in memory, which significantly reduces memory usage with little performance impact. 29 | 30 | ## Examples 31 | 32 | ### Simple fine-tuning 33 | 34 | ``` 35 | python3 simple_lm_finetuning.py 36 | --train_corpus my_corpus.txt 37 | --bert_model bert-base-uncased 38 | --do_lower_case 39 | --output_dir finetuned_lm/ 40 | ``` 41 | 42 | ### Pregenerating training data 43 | 44 | ``` 45 | python3 pregenerate_training_data.py 46 | --train_corpus my_corpus.txt 47 | --bert_model bert-base-uncased 48 | --do_lower_case 49 | --output_dir training/ 50 | --epochs_to_generate 3 51 | --max_seq_len 256 52 | ``` 53 | 54 | ### Training on pregenerated data 55 | 56 | ``` 57 | python3 finetune_on_pregenerated.py 58 | --pregenerated_data training/ 59 | --bert_model bert-base-uncased 60 | --do_lower_case 61 | --output_dir finetuned_lm/ 62 | --epochs 3 63 | ``` -------------------------------------------------------------------------------- /examples/run_transfo_xl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ PyTorch Transformer XL model evaluation script. 17 | Adapted from https://github.com/kimiyoung/transformer-xl. 
18 | In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py 19 | 20 | This script with default values evaluates a pretrained Transformer-XL on WikiText 103 21 | """ 22 | from __future__ import absolute_import, division, print_function, unicode_literals 23 | 24 | import argparse 25 | import logging 26 | import time 27 | import math 28 | 29 | import torch 30 | 31 | from pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLCorpus 32 | 33 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 34 | datefmt = '%m/%d/%Y %H:%M:%S', 35 | level = logging.INFO) 36 | logger = logging.getLogger(__name__) 37 | 38 | def main(): 39 | parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model') 40 | parser.add_argument('--model_name', type=str, default='transfo-xl-wt103', 41 | help='pretrained model name') 42 | parser.add_argument('--split', type=str, default='test', 43 | choices=['all', 'valid', 'test'], 44 | help='which split to evaluate') 45 | parser.add_argument('--batch_size', type=int, default=10, 46 | help='batch size') 47 | parser.add_argument('--tgt_len', type=int, default=128, 48 | help='number of tokens to predict') 49 | parser.add_argument('--ext_len', type=int, default=0, 50 | help='length of the extended context') 51 | parser.add_argument('--mem_len', type=int, default=1600, 52 | help='length of the retained previous heads') 53 | parser.add_argument('--clamp_len', type=int, default=1000, 54 | help='max positional embedding index') 55 | parser.add_argument('--no_cuda', action='store_true', 56 | help='Do not use CUDA even though CUA is available') 57 | parser.add_argument('--work_dir', type=str, required=True, 58 | help='path to the work_dir') 59 | parser.add_argument('--no_log', action='store_true', 60 | help='do not log the eval result') 61 | parser.add_argument('--same_length', action='store_true', 62 | help='set same length attention with masking') 63 | parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") 64 | parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") 65 | args = parser.parse_args() 66 | assert args.ext_len >= 0, 'extended context length must be non-negative' 67 | 68 | if args.server_ip and args.server_port: 69 | # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script 70 | import ptvsd 71 | print("Waiting for debugger attach") 72 | ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) 73 | ptvsd.wait_for_attach() 74 | 75 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 76 | logger.info("device: {}".format(device)) 77 | 78 | # Load a pre-processed dataset 79 | # You can also build the corpus yourself using TransfoXLCorpus methods 80 | # The pre-processing involve computing word frequencies to prepare the Adaptive input and SoftMax 81 | # and tokenizing the dataset 82 | # The pre-processed corpus is a convertion (using the conversion script ) 83 | corpus = TransfoXLCorpus.from_pretrained(args.model_name) 84 | ntokens = len(corpus.vocab) 85 | 86 | va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len, 87 | device=device, ext_len=args.ext_len) 88 | te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len, 89 | device=device, ext_len=args.ext_len) 90 | 91 | # Load a pre-trained model 92 | model = TransfoXLLMHeadModel.from_pretrained(args.model_name) 
93 | model = model.to(device) 94 | 95 | logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format( 96 | args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len)) 97 | 98 | model.reset_length(args.tgt_len, args.ext_len, args.mem_len) 99 | if args.clamp_len > 0: 100 | model.clamp_len = args.clamp_len 101 | if args.same_length: 102 | model.same_length = True 103 | 104 | ############################################################################### 105 | # Evaluation code 106 | ############################################################################### 107 | def evaluate(eval_iter): 108 | # Turn on evaluation mode which disables dropout. 109 | model.eval() 110 | total_len, total_loss = 0, 0. 111 | start_time = time.time() 112 | with torch.no_grad(): 113 | mems = None 114 | for idx, (data, target, seq_len) in enumerate(eval_iter): 115 | ret = model(data, target, mems) 116 | loss, mems = ret 117 | loss = loss.mean() 118 | total_loss += seq_len * loss.item() 119 | total_len += seq_len 120 | total_time = time.time() - start_time 121 | logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format( 122 | total_time, 1000 * total_time / (idx+1))) 123 | return total_loss / total_len 124 | 125 | # Run on test data. 126 | if args.split == 'all': 127 | test_loss = evaluate(te_iter) 128 | valid_loss = evaluate(va_iter) 129 | elif args.split == 'valid': 130 | valid_loss = evaluate(va_iter) 131 | test_loss = None 132 | elif args.split == 'test': 133 | test_loss = evaluate(te_iter) 134 | valid_loss = None 135 | 136 | def format_log(loss, split): 137 | log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format( 138 | split, loss, math.exp(loss)) 139 | return log_str 140 | 141 | log_str = '' 142 | if valid_loss is not None: 143 | log_str += format_log(valid_loss, 'valid') 144 | if test_loss is not None: 145 | log_str += format_log(test_loss, 'test') 146 | 147 | logger.info('=' * 100) 148 | logger.info(log_str) 149 | logger.info('=' * 100) 150 | 151 | if __name__ == '__main__': 152 | main() 153 | -------------------------------------------------------------------------------- /pytorch_pretrained_bert/optimization_openai.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """PyTorch optimization for OpenAI GPT model.""" 16 | 17 | import math 18 | import torch 19 | from torch.optim import Optimizer 20 | from torch.optim.optimizer import required 21 | from torch.nn.utils import clip_grad_norm_ 22 | import logging 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | def warmup_cosine(x, warmup=0.002): 27 | if x < warmup: 28 | return x/warmup 29 | x_ = (x - warmup) / (1 - warmup) # progress after warmup 30 | return 0.5 * (1. 
+ math.cos(math.pi * x_)) 31 | 32 | def warmup_constant(x, warmup=0.002): 33 | """ Linearly increases learning rate over `warmup`*`t_total` (as provided to OpenAIAdam) training steps. 34 | Learning rate is 1. afterwards. """ 35 | if x < warmup: 36 | return x/warmup 37 | return 1.0 38 | 39 | def warmup_linear(x, warmup=0.002): 40 | """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to OpenAIAdam) training step. 41 | After `t_total`-th training step, learning rate is zero. """ 42 | if x < warmup: 43 | return x/warmup 44 | return max((x-1.)/(warmup-1.), 0) 45 | 46 | SCHEDULES = { 47 | 'warmup_cosine':warmup_cosine, 48 | 'warmup_constant':warmup_constant, 49 | 'warmup_linear':warmup_linear, 50 | } 51 | 52 | 53 | class OpenAIAdam(Optimizer): 54 | """Implements Open AI version of Adam algorithm with weight decay fix. 55 | """ 56 | def __init__(self, params, lr=required, schedule='warmup_linear', warmup=-1, t_total=-1, 57 | b1=0.9, b2=0.999, e=1e-8, weight_decay=0, 58 | vector_l2=False, max_grad_norm=-1, **kwargs): 59 | if lr is not required and lr < 0.0: 60 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 61 | if schedule not in SCHEDULES: 62 | raise ValueError("Invalid schedule parameter: {}".format(schedule)) 63 | if not 0.0 <= warmup < 1.0 and not warmup == -1: 64 | raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) 65 | if not 0.0 <= b1 < 1.0: 66 | raise ValueError("Invalid b1 parameter: {}".format(b1)) 67 | if not 0.0 <= b2 < 1.0: 68 | raise ValueError("Invalid b2 parameter: {}".format(b2)) 69 | if not e >= 0.0: 70 | raise ValueError("Invalid epsilon value: {}".format(e)) 71 | defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, 72 | b1=b1, b2=b2, e=e, weight_decay=weight_decay, vector_l2=vector_l2, 73 | max_grad_norm=max_grad_norm) 74 | super(OpenAIAdam, self).__init__(params, defaults) 75 | 76 | def get_lr(self): 77 | lr = [] 78 | for group in self.param_groups: 79 | for p in group['params']: 80 | state = self.state[p] 81 | if len(state) == 0: 82 | return [0] 83 | if group['t_total'] != -1: 84 | schedule_fct = SCHEDULES[group['schedule']] 85 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) 86 | else: 87 | lr_scheduled = group['lr'] 88 | lr.append(lr_scheduled) 89 | return lr 90 | 91 | def step(self, closure=None): 92 | """Performs a single optimization step. 93 | 94 | Arguments: 95 | closure (callable, optional): A closure that reevaluates the model 96 | and returns the loss. 
97 | """ 98 | loss = None 99 | if closure is not None: 100 | loss = closure() 101 | 102 | warned_for_t_total = False 103 | 104 | for group in self.param_groups: 105 | for p in group['params']: 106 | if p.grad is None: 107 | continue 108 | grad = p.grad.data 109 | if grad.is_sparse: 110 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 111 | 112 | state = self.state[p] 113 | 114 | # State initialization 115 | if len(state) == 0: 116 | state['step'] = 0 117 | # Exponential moving average of gradient values 118 | state['exp_avg'] = torch.zeros_like(p.data) 119 | # Exponential moving average of squared gradient values 120 | state['exp_avg_sq'] = torch.zeros_like(p.data) 121 | 122 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 123 | beta1, beta2 = group['b1'], group['b2'] 124 | 125 | state['step'] += 1 126 | 127 | # Add grad clipping 128 | if group['max_grad_norm'] > 0: 129 | clip_grad_norm_(p, group['max_grad_norm']) 130 | 131 | # Decay the first and second moment running average coefficient 132 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 133 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 134 | denom = exp_avg_sq.sqrt().add_(group['e']) 135 | 136 | bias_correction1 = 1 - beta1 ** state['step'] 137 | bias_correction2 = 1 - beta2 ** state['step'] 138 | 139 | if group['t_total'] != -1: 140 | schedule_fct = SCHEDULES[group['schedule']] 141 | progress = state['step']/group['t_total'] 142 | lr_scheduled = group['lr'] * schedule_fct(progress, group['warmup']) 143 | # warning for exceeding t_total (only active with warmup_linear 144 | if group['schedule'] == "warmup_linear" and progress > 1. and not warned_for_t_total: 145 | logger.warning( 146 | "Training beyond specified 't_total' steps with schedule '{}'. Learning rate set to {}. " 147 | "Please set 't_total' of {} correctly.".format(group['schedule'], lr_scheduled, self.__class__.__name__)) 148 | warned_for_t_total = True 149 | # end warning 150 | else: 151 | lr_scheduled = group['lr'] 152 | 153 | step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 154 | 155 | p.data.addcdiv_(-step_size, exp_avg, denom) 156 | 157 | # Add weight decay at the end (fixed version) 158 | if (len(p.size()) > 1 or group['vector_l2']) and group['weight_decay'] > 0: 159 | p.data.add_(-lr_scheduled * group['weight_decay'], p.data) 160 | 161 | return loss 162 | -------------------------------------------------------------------------------- /pytorch_pretrained_bert/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
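# The code below implements BertAdam: Adam with decoupled ("fixed") weight decay applied directly to the
# parameters, optional gradient clipping via `max_grad_norm`, warmup-based learning-rate schedules
# (`warmup_cosine`, `warmup_constant`, `warmup_linear`; see SCHEDULES), and no Adam bias correction.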
15 | """PyTorch optimization for BERT model.""" 16 | 17 | import math 18 | import torch 19 | from torch.optim import Optimizer 20 | from torch.optim.optimizer import required 21 | from torch.nn.utils import clip_grad_norm_ 22 | import logging 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | def warmup_cosine(x, warmup=0.002): 27 | if x < warmup: 28 | return x/warmup 29 | x_ = (x - warmup) / (1 - warmup) # progress after warmup - 30 | return 0.5 * (1. + math.cos(math.pi * x_)) 31 | 32 | def warmup_constant(x, warmup=0.002): 33 | """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps. 34 | Learning rate is 1. afterwards. """ 35 | if x < warmup: 36 | return x/warmup 37 | return 1.0 38 | 39 | def warmup_linear(x, warmup=0.002): 40 | """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to BertAdam) training step. 41 | After `t_total`-th training step, learning rate is zero. """ 42 | if x < warmup: 43 | return x/warmup 44 | return max((x-1.)/(warmup-1.), 0) 45 | 46 | SCHEDULES = { 47 | 'warmup_cosine': warmup_cosine, 48 | 'warmup_constant': warmup_constant, 49 | 'warmup_linear': warmup_linear, 50 | } 51 | 52 | 53 | class BertAdam(Optimizer): 54 | """Implements BERT version of Adam algorithm with weight decay fix. 55 | Params: 56 | lr: learning rate 57 | warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 58 | t_total: total number of training steps for the learning 59 | rate schedule, -1 means constant learning rate. Default: -1 60 | schedule: schedule to use for the warmup (see above). Default: 'warmup_linear' 61 | b1: Adams b1. Default: 0.9 62 | b2: Adams b2. Default: 0.999 63 | e: Adams epsilon. Default: 1e-6 64 | weight_decay: Weight decay. Default: 0.01 65 | max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 66 | """ 67 | def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', 68 | b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, 69 | max_grad_norm=1.0): 70 | if lr is not required and lr < 0.0: 71 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 72 | if schedule not in SCHEDULES: 73 | raise ValueError("Invalid schedule parameter: {}".format(schedule)) 74 | if not 0.0 <= warmup < 1.0 and not warmup == -1: 75 | raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) 76 | if not 0.0 <= b1 < 1.0: 77 | raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) 78 | if not 0.0 <= b2 < 1.0: 79 | raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) 80 | if not e >= 0.0: 81 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) 82 | defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, 83 | b1=b1, b2=b2, e=e, weight_decay=weight_decay, 84 | max_grad_norm=max_grad_norm) 85 | super(BertAdam, self).__init__(params, defaults) 86 | 87 | def get_lr(self): 88 | lr = [] 89 | for group in self.param_groups: 90 | for p in group['params']: 91 | state = self.state[p] 92 | if len(state) == 0: 93 | return [0] 94 | if group['t_total'] != -1: 95 | schedule_fct = SCHEDULES[group['schedule']] 96 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) 97 | else: 98 | lr_scheduled = group['lr'] 99 | lr.append(lr_scheduled) 100 | return lr 101 | 102 | def step(self, closure=None): 103 | """Performs a single optimization step. 
104 | 105 | Arguments: 106 | closure (callable, optional): A closure that reevaluates the model 107 | and returns the loss. 108 | """ 109 | loss = None 110 | if closure is not None: 111 | loss = closure() 112 | 113 | warned_for_t_total = False 114 | 115 | for group in self.param_groups: 116 | for p in group['params']: 117 | if p.grad is None: 118 | continue 119 | grad = p.grad.data 120 | if grad.is_sparse: 121 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 122 | 123 | state = self.state[p] 124 | 125 | # State initialization 126 | if len(state) == 0: 127 | state['step'] = 0 128 | # Exponential moving average of gradient values 129 | state['next_m'] = torch.zeros_like(p.data) 130 | # Exponential moving average of squared gradient values 131 | state['next_v'] = torch.zeros_like(p.data) 132 | 133 | next_m, next_v = state['next_m'], state['next_v'] 134 | beta1, beta2 = group['b1'], group['b2'] 135 | 136 | # Add grad clipping 137 | if group['max_grad_norm'] > 0: 138 | clip_grad_norm_(p, group['max_grad_norm']) 139 | 140 | # Decay the first and second moment running average coefficient 141 | # In-place operations to update the averages at the same time 142 | next_m.mul_(beta1).add_(1 - beta1, grad) 143 | next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) 144 | update = next_m / (next_v.sqrt() + group['e']) 145 | 146 | # Just adding the square of the weights to the loss function is *not* 147 | # the correct way of using L2 regularization/weight decay with Adam, 148 | # since that will interact with the m and v parameters in strange ways. 149 | # 150 | # Instead we want to decay the weights in a manner that doesn't interact 151 | # with the m/v parameters. This is equivalent to adding the square 152 | # of the weights to the loss with plain (non-momentum) SGD. 153 | if group['weight_decay'] > 0.0: 154 | update += group['weight_decay'] * p.data 155 | 156 | if group['t_total'] != -1: 157 | schedule_fct = SCHEDULES[group['schedule']] 158 | progress = state['step']/group['t_total'] 159 | lr_scheduled = group['lr'] * schedule_fct(progress, group['warmup']) 160 | # warning for exceeding t_total (only active with warmup_linear 161 | if group['schedule'] == "warmup_linear" and progress > 1. and not warned_for_t_total: 162 | logger.warning( 163 | "Training beyond specified 't_total' steps with schedule '{}'. Learning rate set to {}. " 164 | "Please set 't_total' of {} correctly.".format(group['schedule'], lr_scheduled, self.__class__.__name__)) 165 | warned_for_t_total = True 166 | # end warning 167 | else: 168 | lr_scheduled = group['lr'] 169 | 170 | update_with_lr = lr_scheduled * update 171 | p.data.add_(-update_with_lr) 172 | 173 | state['step'] += 1 174 | 175 | # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 176 | # No bias correction 177 | # bias_correction1 = 1 - beta1 ** state['step'] 178 | # bias_correction2 = 1 - beta2 ** state['step'] 179 | 180 | return loss 181 | -------------------------------------------------------------------------------- /tests/modeling_gpt2_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import json 21 | import random 22 | 23 | import torch 24 | 25 | from pytorch_pretrained_bert import (GPT2Config, GPT2Model, 26 | GPT2LMHeadModel, GPT2DoubleHeadsModel) 27 | 28 | 29 | class GPT2ModelTest(unittest.TestCase): 30 | class GPT2ModelTester(object): 31 | 32 | def __init__(self, 33 | parent, 34 | batch_size=13, 35 | seq_length=7, 36 | is_training=True, 37 | use_position_ids=True, 38 | use_token_type_ids=True, 39 | use_labels=True, 40 | vocab_size=99, 41 | n_positions=33, 42 | n_embd=32, 43 | n_layer=5, 44 | n_head=4, 45 | n_choices=3, 46 | type_sequence_label_size=2, 47 | initializer_range=0.02, 48 | num_labels=3, 49 | scope=None): 50 | self.parent = parent 51 | self.batch_size = batch_size 52 | self.seq_length = seq_length 53 | self.is_training = is_training 54 | self.use_position_ids = use_position_ids 55 | self.use_token_type_ids = use_token_type_ids 56 | self.use_labels = use_labels 57 | self.vocab_size = vocab_size 58 | self.n_positions = n_positions 59 | self.n_embd = n_embd 60 | self.n_layer = n_layer 61 | self.n_head = n_head 62 | self.n_choices = n_choices 63 | self.type_sequence_label_size = type_sequence_label_size 64 | self.initializer_range = initializer_range 65 | self.num_labels = num_labels 66 | self.scope = scope 67 | 68 | def prepare_config_and_inputs(self): 69 | input_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.vocab_size) 70 | 71 | position_ids = None 72 | if self.use_position_ids: 73 | position_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions) 74 | 75 | token_type_ids = None 76 | if self.use_token_type_ids: 77 | total_voc = self.vocab_size 78 | token_type_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc) 79 | 80 | mc_labels = None 81 | lm_labels = None 82 | mc_token_ids = None 83 | if self.use_labels: 84 | mc_labels = GPT2ModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size) 85 | lm_labels = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels) 86 | mc_token_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices], self.seq_length) 87 | 88 | config = GPT2Config( 89 | vocab_size_or_config_json_file=self.vocab_size, 90 | n_positions=self.n_positions, 91 | n_embd=self.n_embd, 92 | n_layer=self.n_layer, 93 | n_head=self.n_head, 94 | initializer_range=self.initializer_range) 95 | 96 | return (config, input_ids, token_type_ids, position_ids, 97 | mc_labels, lm_labels, mc_token_ids) 98 | 99 | def create_gpt2_model(self, config, input_ids, token_type_ids, position_ids, 100 | mc_labels, lm_labels, mc_token_ids): 101 | model = GPT2Model(config) 102 | model.eval() 103 | hidden_states, presents = model(input_ids, position_ids, token_type_ids) 104 | outputs = { 105 | "hidden_states": hidden_states, 106 | "presents": presents, 107 | } 108 | return outputs 109 | 110 | def 
check_gpt2_model_output(self, result): 111 | self.parent.assertListEqual( 112 | list(result["hidden_states"].size()), 113 | [self.batch_size, self.n_choices, self.seq_length, self.n_embd]) 114 | 115 | 116 | def create_gpt2_lm_head(self, config, input_ids, token_type_ids, position_ids, 117 | mc_labels, lm_labels, mc_token_ids): 118 | model = GPT2LMHeadModel(config) 119 | model.eval() 120 | loss = model(input_ids, position_ids, token_type_ids, lm_labels) 121 | lm_logits, presents = model(input_ids, position_ids, token_type_ids) 122 | outputs = { 123 | "loss": loss, 124 | "lm_logits": lm_logits, 125 | "presents": presents, 126 | } 127 | return outputs 128 | 129 | def check_gpt2_lm_head_output(self, result): 130 | total_voc = self.vocab_size 131 | self.parent.assertListEqual( 132 | list(result["lm_logits"].size()), 133 | [self.batch_size, self.n_choices, self.seq_length, total_voc]) 134 | 135 | def check_gpt2_lm_head_loss_output(self, result): 136 | self.parent.assertListEqual( 137 | list(result["loss"].size()), 138 | []) 139 | 140 | def create_gpt2_double_heads(self, config, input_ids, token_type_ids, position_ids, 141 | mc_labels, lm_labels, mc_token_ids): 142 | model = GPT2DoubleHeadsModel(config) 143 | model.eval() 144 | loss = model(input_ids, mc_token_ids, 145 | lm_labels=lm_labels, mc_labels=mc_labels, 146 | token_type_ids=token_type_ids, position_ids=position_ids) 147 | lm_logits, mc_logits, presents = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids) 148 | outputs = { 149 | "loss": loss, 150 | "lm_logits": lm_logits, 151 | "mc_logits": mc_logits, 152 | "presents": presents, 153 | } 154 | return outputs 155 | 156 | def check_gpt2_double_heads_output(self, result): 157 | total_voc = self.vocab_size 158 | self.parent.assertListEqual( 159 | list(result["lm_logits"].size()), 160 | [self.batch_size, self.n_choices, self.seq_length, total_voc]) 161 | self.parent.assertListEqual( 162 | list(result["mc_logits"].size()), 163 | [self.batch_size, self.n_choices]) 164 | 165 | def check_gpt2_double_heads_loss_output(self, result): 166 | self.parent.assertListEqual( 167 | [list(l.size()) for l in result["loss"]], 168 | [[], []]) 169 | 170 | def test_default(self): 171 | self.run_tester(GPT2ModelTest.GPT2ModelTester(self)) 172 | 173 | def test_config_to_json_string(self): 174 | config = GPT2Config(vocab_size_or_config_json_file=99, n_embd=37) 175 | obj = json.loads(config.to_json_string()) 176 | self.assertEqual(obj["vocab_size"], 99) 177 | self.assertEqual(obj["n_embd"], 37) 178 | 179 | def run_tester(self, tester): 180 | config_and_inputs = tester.prepare_config_and_inputs() 181 | output_result = tester.create_gpt2_model(*config_and_inputs) 182 | tester.check_gpt2_model_output(output_result) 183 | 184 | output_result = tester.create_gpt2_lm_head(*config_and_inputs) 185 | tester.check_gpt2_lm_head_output(output_result) 186 | tester.check_gpt2_lm_head_loss_output(output_result) 187 | 188 | output_result = tester.create_gpt2_double_heads(*config_and_inputs) 189 | tester.check_gpt2_double_heads_output(output_result) 190 | tester.check_gpt2_double_heads_loss_output(output_result) 191 | 192 | @classmethod 193 | def ids_tensor(cls, shape, vocab_size, rng=None, name=None): 194 | """Creates a random int32 tensor of the shape within the vocab size.""" 195 | if rng is None: 196 | rng = random.Random() 197 | 198 | total_dims = 1 199 | for dim in shape: 200 | total_dims *= dim 201 | 202 | values = [] 203 | for _ in range(total_dims): 204 | values.append(rng.randint(0, 
vocab_size - 1)) 205 | 206 | return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous() 207 | 208 | 209 | if __name__ == "__main__": 210 | unittest.main() 211 | -------------------------------------------------------------------------------- /pytorch_pretrained_bert/file_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for working with the local dataset cache. 3 | This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp 4 | Copyright by the AllenNLP authors. 5 | """ 6 | from __future__ import (absolute_import, division, print_function, unicode_literals) 7 | 8 | import json 9 | import logging 10 | import os 11 | import shutil 12 | import tempfile 13 | from functools import wraps 14 | from hashlib import sha256 15 | import sys 16 | from io import open 17 | 18 | import boto3 19 | import requests 20 | from botocore.exceptions import ClientError 21 | from tqdm import tqdm 22 | 23 | try: 24 | from urllib.parse import urlparse 25 | except ImportError: 26 | from urlparse import urlparse 27 | 28 | try: 29 | from pathlib import Path 30 | PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 31 | Path.home() / '.pytorch_pretrained_bert')) 32 | except (AttributeError, ImportError): 33 | PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 34 | os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert')) 35 | 36 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 37 | 38 | 39 | def url_to_filename(url, etag=None): 40 | """ 41 | Convert `url` into a hashed filename in a repeatable way. 42 | If `etag` is specified, append its hash to the url's, delimited 43 | by a period. 44 | """ 45 | url_bytes = url.encode('utf-8') 46 | url_hash = sha256(url_bytes) 47 | filename = url_hash.hexdigest() 48 | 49 | if etag: 50 | etag_bytes = etag.encode('utf-8') 51 | etag_hash = sha256(etag_bytes) 52 | filename += '.' + etag_hash.hexdigest() 53 | 54 | return filename 55 | 56 | 57 | def filename_to_url(filename, cache_dir=None): 58 | """ 59 | Return the url and etag (which may be ``None``) stored for `filename`. 60 | Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. 61 | """ 62 | if cache_dir is None: 63 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 64 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 65 | cache_dir = str(cache_dir) 66 | 67 | cache_path = os.path.join(cache_dir, filename) 68 | if not os.path.exists(cache_path): 69 | raise EnvironmentError("file {} not found".format(cache_path)) 70 | 71 | meta_path = cache_path + '.json' 72 | if not os.path.exists(meta_path): 73 | raise EnvironmentError("file {} not found".format(meta_path)) 74 | 75 | with open(meta_path, encoding="utf-8") as meta_file: 76 | metadata = json.load(meta_file) 77 | url = metadata['url'] 78 | etag = metadata['etag'] 79 | 80 | return url, etag 81 | 82 | 83 | def cached_path(url_or_filename, cache_dir=None): 84 | """ 85 | Given something that might be a URL (or might be a local path), 86 | determine which. If it's a URL, download the file and cache it, and 87 | return the path to the cached file. If it's already a local path, 88 | make sure the file exists and then return the path. 
89 | """ 90 | if cache_dir is None: 91 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 92 | if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): 93 | url_or_filename = str(url_or_filename) 94 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 95 | cache_dir = str(cache_dir) 96 | 97 | parsed = urlparse(url_or_filename) 98 | 99 | if parsed.scheme in ('http', 'https', 's3'): 100 | # URL, so get it from the cache (downloading if necessary) 101 | return get_from_cache(url_or_filename, cache_dir) 102 | elif os.path.exists(url_or_filename): 103 | # File, and it exists. 104 | return url_or_filename 105 | elif parsed.scheme == '': 106 | # File, but it doesn't exist. 107 | raise EnvironmentError("file {} not found".format(url_or_filename)) 108 | else: 109 | # Something unknown 110 | raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) 111 | 112 | 113 | def split_s3_path(url): 114 | """Split a full s3 path into the bucket name and path.""" 115 | parsed = urlparse(url) 116 | if not parsed.netloc or not parsed.path: 117 | raise ValueError("bad s3 path {}".format(url)) 118 | bucket_name = parsed.netloc 119 | s3_path = parsed.path 120 | # Remove '/' at beginning of path. 121 | if s3_path.startswith("/"): 122 | s3_path = s3_path[1:] 123 | return bucket_name, s3_path 124 | 125 | 126 | def s3_request(func): 127 | """ 128 | Wrapper function for s3 requests in order to create more helpful error 129 | messages. 130 | """ 131 | 132 | @wraps(func) 133 | def wrapper(url, *args, **kwargs): 134 | try: 135 | return func(url, *args, **kwargs) 136 | except ClientError as exc: 137 | if int(exc.response["Error"]["Code"]) == 404: 138 | raise EnvironmentError("file {} not found".format(url)) 139 | else: 140 | raise 141 | 142 | return wrapper 143 | 144 | 145 | @s3_request 146 | def s3_etag(url): 147 | """Check ETag on S3 object.""" 148 | s3_resource = boto3.resource("s3") 149 | bucket_name, s3_path = split_s3_path(url) 150 | s3_object = s3_resource.Object(bucket_name, s3_path) 151 | return s3_object.e_tag 152 | 153 | 154 | @s3_request 155 | def s3_get(url, temp_file): 156 | """Pull a file directly from S3.""" 157 | s3_resource = boto3.resource("s3") 158 | bucket_name, s3_path = split_s3_path(url) 159 | s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) 160 | 161 | 162 | def http_get(url, temp_file): 163 | req = requests.get(url, stream=True) 164 | content_length = req.headers.get('Content-Length') 165 | total = int(content_length) if content_length is not None else None 166 | progress = tqdm(unit="B", total=total) 167 | for chunk in req.iter_content(chunk_size=1024): 168 | if chunk: # filter out keep-alive new chunks 169 | progress.update(len(chunk)) 170 | temp_file.write(chunk) 171 | progress.close() 172 | 173 | 174 | def get_from_cache(url, cache_dir=None): 175 | """ 176 | Given a URL, look for the corresponding dataset in the local cache. 177 | If it's not there, download it. Then return the path to the cached file. 178 | """ 179 | if cache_dir is None: 180 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 181 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 182 | cache_dir = str(cache_dir) 183 | 184 | if not os.path.exists(cache_dir): 185 | os.makedirs(cache_dir) 186 | 187 | # Get eTag to add to filename, if it exists. 
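    # (The ETag, when available, is hashed into the cache filename by url_to_filename, so an updated
    # remote file produces a new cache entry rather than silently reusing a stale download.)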
188 | if url.startswith("s3://"): 189 | etag = s3_etag(url) 190 | else: 191 | response = requests.head(url, allow_redirects=True) 192 | if response.status_code != 200: 193 | raise IOError("HEAD request failed for url {} with status code {}" 194 | .format(url, response.status_code)) 195 | etag = response.headers.get("ETag") 196 | 197 | filename = url_to_filename(url, etag) 198 | 199 | # get cache path to put the file 200 | cache_path = os.path.join(cache_dir, filename) 201 | 202 | if not os.path.exists(cache_path): 203 | # Download to temporary file, then copy to cache dir once finished. 204 | # Otherwise you get corrupt cache entries if the download gets interrupted. 205 | with tempfile.NamedTemporaryFile() as temp_file: 206 | logger.info("%s not found in cache, downloading to %s", url, temp_file.name) 207 | 208 | # GET file object 209 | if url.startswith("s3://"): 210 | s3_get(url, temp_file) 211 | else: 212 | http_get(url, temp_file) 213 | 214 | # we are copying the file before closing it, so flush to avoid truncation 215 | temp_file.flush() 216 | # shutil.copyfileobj() starts at the current position, so go to the start 217 | temp_file.seek(0) 218 | 219 | logger.info("copying %s to cache at %s", temp_file.name, cache_path) 220 | with open(cache_path, 'wb') as cache_file: 221 | shutil.copyfileobj(temp_file, cache_file) 222 | 223 | logger.info("creating metadata file for %s", cache_path) 224 | meta = {'url': url, 'etag': etag} 225 | meta_path = cache_path + '.json' 226 | with open(meta_path, 'w', encoding="utf-8") as meta_file: 227 | json.dump(meta, meta_file) 228 | 229 | logger.info("removing temp file %s", temp_file.name) 230 | 231 | return cache_path 232 | 233 | 234 | def read_set_from_file(filename): 235 | ''' 236 | Extract a de-duped collection (set) of text from a file. 237 | Expected file format is one item per line. 238 | ''' 239 | collection = set() 240 | with open(filename, 'r', encoding='utf-8') as file_: 241 | for line in file_: 242 | collection.add(line.rstrip()) 243 | return collection 244 | 245 | 246 | def get_file_extension(path, dot=True, lower=True): 247 | ext = os.path.splitext(path)[1] 248 | ext = ext if dot else ext[1:] 249 | return ext.lower() if lower else ext 250 | -------------------------------------------------------------------------------- /pytorch_pretrained_bert/tokenization_gpt2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for OpenAI GPT.""" 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import json 20 | import logging 21 | import os 22 | import regex as re 23 | from io import open 24 | 25 | try: 26 | from functools import lru_cache 27 | except ImportError: 28 | # Just a dummy decorator to get the checks to run on python2 29 | # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 30 | def lru_cache(): 31 | return lambda func: func 32 | 33 | from .file_utils import cached_path 34 | 35 | logger = logging.getLogger(__name__) 36 | 37 | PRETRAINED_VOCAB_ARCHIVE_MAP = { 38 | 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", 39 | } 40 | PRETRAINED_MERGES_ARCHIVE_MAP = { 41 | 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", 42 | } 43 | PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { 44 | 'gpt2': 1024, 45 | } 46 | VOCAB_NAME = 'vocab.json' 47 | MERGES_NAME = 'merges.txt' 48 | 49 | @lru_cache() 50 | def bytes_to_unicode(): 51 | """ 52 | Returns list of utf-8 byte and a corresponding list of unicode strings. 53 | The reversible bpe codes work on unicode strings. 54 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 55 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 56 | This is a signficant percentage of your normal, say, 32K bpe vocab. 57 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 58 | And avoids mapping to whitespace/control characters the bpe code barfs on. 59 | """ 60 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 61 | cs = bs[:] 62 | n = 0 63 | for b in range(2**8): 64 | if b not in bs: 65 | bs.append(b) 66 | cs.append(2**8+n) 67 | n += 1 68 | cs = [chr(n) for n in cs] 69 | return dict(zip(bs, cs)) 70 | 71 | def get_pairs(word): 72 | """Return set of symbol pairs in a word. 73 | 74 | Word is represented as tuple of symbols (symbols being variable-length strings). 75 | """ 76 | pairs = set() 77 | prev_char = word[0] 78 | for char in word[1:]: 79 | pairs.add((prev_char, char)) 80 | prev_char = char 81 | return pairs 82 | 83 | class GPT2Tokenizer(object): 84 | """ 85 | GPT-2 BPE tokenizer. Peculiarities: 86 | - Byte-level BPE 87 | """ 88 | @classmethod 89 | def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): 90 | """ 91 | Instantiate a PreTrainedBertModel from a pre-trained model file. 92 | Download and cache the pre-trained model file if needed. 93 | """ 94 | if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: 95 | vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] 96 | merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path] 97 | else: 98 | vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) 99 | merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME) 100 | # redirect to the cache, if necessary 101 | try: 102 | resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) 103 | resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir) 104 | except EnvironmentError: 105 | logger.error( 106 | "Model name '{}' was not found in model name list ({}). 
" 107 | "We assumed '{}' was a path or url but couldn't find files {} and {} " 108 | "at this path or url.".format( 109 | pretrained_model_name_or_path, 110 | ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), 111 | pretrained_model_name_or_path, 112 | vocab_file, merges_file)) 113 | return None 114 | if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file: 115 | logger.info("loading vocabulary file {}".format(vocab_file)) 116 | logger.info("loading merges file {}".format(merges_file)) 117 | else: 118 | logger.info("loading vocabulary file {} from cache at {}".format( 119 | vocab_file, resolved_vocab_file)) 120 | logger.info("loading merges file {} from cache at {}".format( 121 | merges_file, resolved_merges_file)) 122 | if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: 123 | # if we're using a pretrained model, ensure the tokenizer wont index sequences longer 124 | # than the number of positional embeddings 125 | max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] 126 | kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) 127 | # Instantiate tokenizer. 128 | tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs) 129 | return tokenizer 130 | 131 | def __init__(self, vocab_file, merges_file, errors='replace', max_len=None): 132 | self.max_len = max_len if max_len is not None else int(1e12) 133 | self.encoder = json.load(open(vocab_file)) 134 | self.decoder = {v:k for k,v in self.encoder.items()} 135 | self.errors = errors # how to handle errors in decoding 136 | self.byte_encoder = bytes_to_unicode() 137 | self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} 138 | bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] 139 | bpe_merges = [tuple(merge.split()) for merge in bpe_data] 140 | self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) 141 | self.cache = {} 142 | 143 | # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions 144 | self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") 145 | 146 | def __len__(self): 147 | return len(self.encoder) 148 | 149 | def bpe(self, token): 150 | if token in self.cache: 151 | return self.cache[token] 152 | word = tuple(token) 153 | pairs = get_pairs(word) 154 | 155 | if not pairs: 156 | return token 157 | 158 | while True: 159 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 160 | if bigram not in self.bpe_ranks: 161 | break 162 | first, second = bigram 163 | new_word = [] 164 | i = 0 165 | while i < len(word): 166 | try: 167 | j = word.index(first, i) 168 | new_word.extend(word[i:j]) 169 | i = j 170 | except: 171 | new_word.extend(word[i:]) 172 | break 173 | 174 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 175 | new_word.append(first+second) 176 | i += 2 177 | else: 178 | new_word.append(word[i]) 179 | i += 1 180 | new_word = tuple(new_word) 181 | word = new_word 182 | if len(word) == 1: 183 | break 184 | else: 185 | pairs = get_pairs(word) 186 | word = ' '.join(word) 187 | self.cache[token] = word 188 | return word 189 | 190 | def encode(self, text): 191 | bpe_tokens = [] 192 | for token in re.findall(self.pat, text): 193 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 194 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 195 | if len(bpe_tokens) > self.max_len: 196 | 
logger.warning( 197 | "Token indices sequence length is longer than the specified maximum " 198 | " sequence length for this OpenAI GPT-2 model ({} > {}). Running this" 199 | " sequence through the model will result in indexing errors".format(len(bpe_tokens), self.max_len) 200 | ) 201 | return bpe_tokens 202 | 203 | def decode(self, tokens): 204 | text = ''.join([self.decoder[token] for token in tokens]) 205 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) 206 | return text 207 | -------------------------------------------------------------------------------- /tests/modeling_transfo_xl_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import json 21 | import random 22 | 23 | import torch 24 | 25 | from pytorch_pretrained_bert import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel) 26 | 27 | 28 | class TransfoXLModelTest(unittest.TestCase): 29 | class TransfoXLModelTester(object): 30 | 31 | def __init__(self, 32 | parent, 33 | batch_size=13, 34 | seq_length=7, 35 | mem_len=30, 36 | clamp_len=15, 37 | is_training=True, 38 | use_labels=True, 39 | vocab_size=99, 40 | cutoffs=[10, 50, 80], 41 | d_model=32, 42 | d_embed=32, 43 | n_head=4, 44 | d_head=8, 45 | d_inner=128, 46 | div_val=2, 47 | n_layer=5, 48 | scope=None, 49 | seed=1): 50 | self.parent = parent 51 | self.batch_size = batch_size 52 | self.seq_length = seq_length 53 | self.mem_len = mem_len 54 | self.clamp_len = clamp_len 55 | self.is_training = is_training 56 | self.use_labels = use_labels 57 | self.vocab_size = vocab_size 58 | self.cutoffs = cutoffs 59 | self.d_model = d_model 60 | self.d_embed = d_embed 61 | self.n_head = n_head 62 | self.d_head = d_head 63 | self.d_inner = d_inner 64 | self.div_val = div_val 65 | self.n_layer = n_layer 66 | self.scope = scope 67 | self.seed = seed 68 | 69 | def prepare_config_and_inputs(self): 70 | input_ids_1 = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size) 71 | input_ids_2 = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size) 72 | 73 | lm_labels = None 74 | if self.use_labels: 75 | lm_labels = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size) 76 | 77 | config = TransfoXLConfig( 78 | vocab_size_or_config_json_file=self.vocab_size, 79 | mem_len=self.mem_len, 80 | clamp_len=self.clamp_len, 81 | cutoffs=self.cutoffs, 82 | d_model=self.d_model, 83 | d_embed=self.d_embed, 84 | n_head=self.n_head, 85 | d_head=self.d_head, 86 | d_inner=self.d_inner, 87 | div_val=self.div_val, 88 | n_layer=self.n_layer) 89 | 90 | return (config, input_ids_1, input_ids_2, lm_labels) 91 | 92 | def set_seed(self): 93 | 
random.seed(self.seed) 94 | torch.manual_seed(self.seed) 95 | 96 | def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels): 97 | model = TransfoXLModel(config) 98 | model.eval() 99 | 100 | hidden_states_1, mems_1 = model(input_ids_1) 101 | hidden_states_2, mems_2 = model(input_ids_2, mems_1) 102 | outputs = { 103 | "hidden_states_1": hidden_states_1, 104 | "mems_1": mems_1, 105 | "hidden_states_2": hidden_states_2, 106 | "mems_2": mems_2, 107 | } 108 | return outputs 109 | 110 | def check_transfo_xl_model_output(self, result): 111 | self.parent.assertListEqual( 112 | list(result["hidden_states_1"].size()), 113 | [self.batch_size, self.seq_length, self.d_model]) 114 | self.parent.assertListEqual( 115 | list(result["hidden_states_2"].size()), 116 | [self.batch_size, self.seq_length, self.d_model]) 117 | self.parent.assertListEqual( 118 | list(list(mem.size()) for mem in result["mems_1"]), 119 | [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer) 120 | self.parent.assertListEqual( 121 | list(list(mem.size()) for mem in result["mems_2"]), 122 | [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer) 123 | 124 | 125 | def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels): 126 | model = TransfoXLLMHeadModel(config) 127 | model.eval() 128 | 129 | loss_1, mems_1a = model(input_ids_1, target=lm_labels) 130 | lm_logits_1, mems_1b = model(input_ids_1) 131 | 132 | loss_2, mems_2a = model(input_ids_2, target=lm_labels, mems=mems_1a) 133 | lm_logits_2, mems_2b = model(input_ids_2, mems=mems_1b) 134 | 135 | outputs = { 136 | "loss_1": loss_1, 137 | "mems_1a": mems_1a, 138 | "lm_logits_1": lm_logits_1, 139 | "mems_1b": mems_1b, 140 | "loss_2": loss_2, 141 | "mems_2a": mems_2a, 142 | "lm_logits_2": lm_logits_2, 143 | "mems_2b": mems_2b, 144 | } 145 | return outputs 146 | 147 | def check_transfo_xl_lm_head_output(self, result): 148 | self.parent.assertListEqual( 149 | list(result["loss_1"].size()), 150 | [self.batch_size, self.seq_length]) 151 | self.parent.assertListEqual( 152 | list(result["lm_logits_1"].size()), 153 | [self.batch_size, self.seq_length, self.vocab_size]) 154 | self.parent.assertListEqual( 155 | list(list(mem.size()) for mem in result["mems_1a"]), 156 | [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer) 157 | self.parent.assertListEqual( 158 | list(list(mem.size()) for mem in result["mems_1b"]), 159 | [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer) 160 | self.parent.assertListEqual( 161 | list(mem[~torch.isnan(mem)].sum() for mem in result["mems_1a"]), 162 | list(mem[~torch.isnan(mem)].sum() for mem in result["mems_1b"])) 163 | 164 | self.parent.assertListEqual( 165 | list(result["loss_2"].size()), 166 | [self.batch_size, self.seq_length]) 167 | self.parent.assertListEqual( 168 | list(result["lm_logits_2"].size()), 169 | [self.batch_size, self.seq_length, self.vocab_size]) 170 | self.parent.assertListEqual( 171 | list(list(mem.size()) for mem in result["mems_2a"]), 172 | [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer) 173 | self.parent.assertListEqual( 174 | list(list(mem.size()) for mem in result["mems_2b"]), 175 | [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer) 176 | self.parent.assertListEqual( 177 | list(mem[~torch.isnan(mem)].sum() for mem in result["mems_2a"]), 178 | list(mem[~torch.isnan(mem)].sum() for mem in result["mems_2b"])) 179 | 180 | def test_default(self): 181 | self.run_tester(TransfoXLModelTest.TransfoXLModelTester(self)) 182 | 183 | def 
test_config_to_json_string(self): 184 | config = TransfoXLConfig(vocab_size_or_config_json_file=96, d_embed=37) 185 | obj = json.loads(config.to_json_string()) 186 | self.assertEqual(obj["n_token"], 96) 187 | self.assertEqual(obj["d_embed"], 37) 188 | 189 | def run_tester(self, tester): 190 | config_and_inputs = tester.prepare_config_and_inputs() 191 | 192 | tester.set_seed() 193 | output_result = tester.create_transfo_xl_model(*config_and_inputs) 194 | tester.check_transfo_xl_model_output(output_result) 195 | 196 | tester.set_seed() 197 | output_result = tester.create_transfo_xl_lm_head(*config_and_inputs) 198 | tester.check_transfo_xl_lm_head_output(output_result) 199 | 200 | @classmethod 201 | def ids_tensor(cls, shape, vocab_size, rng=None, name=None): 202 | """Creates a random int32 tensor of the shape within the vocab size.""" 203 | if rng is None: 204 | rng = random.Random() 205 | 206 | total_dims = 1 207 | for dim in shape: 208 | total_dims *= dim 209 | 210 | values = [] 211 | for _ in range(total_dims): 212 | values.append(rng.randint(0, vocab_size - 1)) 213 | 214 | return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous() 215 | 216 | 217 | if __name__ == "__main__": 218 | unittest.main() 219 | -------------------------------------------------------------------------------- /tests/modeling_openai_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import json 21 | import random 22 | 23 | import torch 24 | 25 | from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel, 26 | OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) 27 | 28 | 29 | class OpenAIGPTModelTest(unittest.TestCase): 30 | class OpenAIGPTModelTester(object): 31 | 32 | def __init__(self, 33 | parent, 34 | batch_size=13, 35 | seq_length=7, 36 | is_training=True, 37 | use_position_ids=True, 38 | use_token_type_ids=True, 39 | use_labels=True, 40 | vocab_size=99, 41 | n_special=1, 42 | n_positions=33, 43 | n_embd=32, 44 | n_layer=5, 45 | n_head=4, 46 | n_choices=3, 47 | afn="gelu", 48 | resid_pdrop=0.1, 49 | attn_pdrop=0.1, 50 | embd_pdrop=0.1, 51 | type_sequence_label_size=2, 52 | initializer_range=0.02, 53 | num_labels=3, 54 | scope=None): 55 | self.parent = parent 56 | self.batch_size = batch_size 57 | self.seq_length = seq_length 58 | self.is_training = is_training 59 | self.use_position_ids = use_position_ids 60 | self.use_token_type_ids = use_token_type_ids 61 | self.use_labels = use_labels 62 | self.vocab_size = vocab_size 63 | self.n_special = n_special 64 | self.n_positions = n_positions 65 | self.n_embd = n_embd 66 | self.n_layer = n_layer 67 | self.n_head = n_head 68 | self.afn = afn 69 | self.n_choices = n_choices 70 | self.resid_pdrop = resid_pdrop 71 | self.attn_pdrop = attn_pdrop 72 | self.embd_pdrop = embd_pdrop 73 | self.type_sequence_label_size = type_sequence_label_size 74 | self.initializer_range = initializer_range 75 | self.num_labels = num_labels 76 | self.scope = scope 77 | 78 | def prepare_config_and_inputs(self): 79 | input_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.vocab_size) 80 | 81 | position_ids = None 82 | if self.use_position_ids: 83 | position_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions) 84 | 85 | token_type_ids = None 86 | if self.use_token_type_ids: 87 | total_voc = self.vocab_size + self.n_special 88 | token_type_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc) 89 | 90 | mc_labels = None 91 | lm_labels = None 92 | mc_token_ids = None 93 | if self.use_labels: 94 | mc_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size) 95 | lm_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels) 96 | mc_token_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices], self.seq_length) 97 | 98 | config = OpenAIGPTConfig( 99 | vocab_size_or_config_json_file=self.vocab_size, 100 | n_positions=self.n_positions, 101 | n_special=self.n_special, 102 | n_embd=self.n_embd, 103 | n_layer=self.n_layer, 104 | n_head=self.n_head, 105 | afn=self.afn, 106 | resid_pdrop=self.resid_pdrop, 107 | attn_pdrop=self.attn_pdrop, 108 | embd_pdrop=self.embd_pdrop, 109 | initializer_range=self.initializer_range) 110 | 111 | return (config, input_ids, token_type_ids, position_ids, 112 | mc_labels, lm_labels, mc_token_ids) 113 | 114 | def create_openai_model(self, config, input_ids, token_type_ids, position_ids, 115 | mc_labels, lm_labels, mc_token_ids): 116 | model = OpenAIGPTModel(config) 117 | model.eval() 118 | hidden_states = model(input_ids, position_ids, token_type_ids) 119 | outputs = { 120 | "hidden_states": hidden_states, 121 | } 122 | return outputs 123 
| 124 | def check_openai_model_output(self, result): 125 | self.parent.assertListEqual( 126 | list(result["hidden_states"].size()), 127 | [self.batch_size, self.n_choices, self.seq_length, self.n_embd]) 128 | 129 | 130 | def create_openai_lm_head(self, config, input_ids, token_type_ids, position_ids, 131 | mc_labels, lm_labels, mc_token_ids): 132 | model = OpenAIGPTLMHeadModel(config) 133 | model.eval() 134 | loss = model(input_ids, position_ids, token_type_ids, lm_labels) 135 | lm_logits = model(input_ids, position_ids, token_type_ids) 136 | outputs = { 137 | "loss": loss, 138 | "lm_logits": lm_logits, 139 | } 140 | return outputs 141 | 142 | def check_openai_lm_head_output(self, result): 143 | total_voc = self.n_special + self.vocab_size 144 | self.parent.assertListEqual( 145 | list(result["lm_logits"].size()), 146 | [self.batch_size, self.n_choices, self.seq_length, total_voc]) 147 | 148 | def check_openai_lm_head_loss_output(self, result): 149 | self.parent.assertListEqual( 150 | list(result["loss"].size()), 151 | []) 152 | 153 | def create_openai_double_heads(self, config, input_ids, token_type_ids, position_ids, 154 | mc_labels, lm_labels, mc_token_ids): 155 | model = OpenAIGPTDoubleHeadsModel(config) 156 | model.eval() 157 | loss = model(input_ids, mc_token_ids, 158 | lm_labels=lm_labels, mc_labels=mc_labels, 159 | token_type_ids=token_type_ids, position_ids=position_ids) 160 | lm_logits, mc_logits = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids) 161 | outputs = { 162 | "loss": loss, 163 | "lm_logits": lm_logits, 164 | "mc_logits": mc_logits, 165 | } 166 | return outputs 167 | 168 | def check_openai_double_heads_output(self, result): 169 | total_voc = self.n_special + self.vocab_size 170 | self.parent.assertListEqual( 171 | list(result["lm_logits"].size()), 172 | [self.batch_size, self.n_choices, self.seq_length, total_voc]) 173 | self.parent.assertListEqual( 174 | list(result["mc_logits"].size()), 175 | [self.batch_size, self.n_choices]) 176 | 177 | def check_openai_double_heads_loss_output(self, result): 178 | self.parent.assertListEqual( 179 | [list(l.size()) for l in result["loss"]], 180 | [[], []]) 181 | 182 | def test_default(self): 183 | self.run_tester(OpenAIGPTModelTest.OpenAIGPTModelTester(self)) 184 | 185 | def test_config_to_json_string(self): 186 | config = OpenAIGPTConfig(vocab_size_or_config_json_file=99, n_embd=37) 187 | obj = json.loads(config.to_json_string()) 188 | self.assertEqual(obj["vocab_size"], 99) 189 | self.assertEqual(obj["n_embd"], 37) 190 | 191 | def run_tester(self, tester): 192 | config_and_inputs = tester.prepare_config_and_inputs() 193 | output_result = tester.create_openai_model(*config_and_inputs) 194 | tester.check_openai_model_output(output_result) 195 | 196 | output_result = tester.create_openai_lm_head(*config_and_inputs) 197 | tester.check_openai_lm_head_output(output_result) 198 | tester.check_openai_lm_head_loss_output(output_result) 199 | 200 | output_result = tester.create_openai_double_heads(*config_and_inputs) 201 | tester.check_openai_double_heads_output(output_result) 202 | tester.check_openai_double_heads_loss_output(output_result) 203 | 204 | @classmethod 205 | def ids_tensor(cls, shape, vocab_size, rng=None, name=None): 206 | """Creates a random int32 tensor of the shape within the vocab size.""" 207 | if rng is None: 208 | rng = random.Random() 209 | 210 | total_dims = 1 211 | for dim in shape: 212 | total_dims *= dim 213 | 214 | values = [] 215 | for _ in range(total_dims): 216 | 
values.append(rng.randint(0, vocab_size - 1)) 217 | 218 | return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous() 219 | 220 | 221 | if __name__ == "__main__": 222 | unittest.main() 223 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /pytorch_pretrained_bert/tokenization_openai.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes for OpenAI GPT."""
16 | from __future__ import (absolute_import, division, print_function,
17 | unicode_literals)
18 | 
19 | import json
20 | import logging
21 | import os
22 | import re
23 | import sys
24 | from io import open
25 | 
26 | from tqdm import tqdm
27 | 
28 | from .file_utils import cached_path
29 | from .tokenization import BasicTokenizer
30 | 
31 | logger = logging.getLogger(__name__)
32 | 
33 | PRETRAINED_VOCAB_ARCHIVE_MAP = {
34 | 'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json",
35 | }
36 | PRETRAINED_MERGES_ARCHIVE_MAP = {
37 | 'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",
38 | }
39 | PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
40 | 'openai-gpt': 512,
41 | }
42 | VOCAB_NAME = 'vocab.json'
43 | MERGES_NAME = 'merges.txt'
44 | 
45 | def get_pairs(word):
46 | """
47 | Return the set of symbol pairs in a word.
48 | A word is represented as a tuple of symbols (symbols being variable-length strings).
49 | """
50 | pairs = set()
51 | prev_char = word[0]
52 | for char in word[1:]:
53 | pairs.add((prev_char, char))
54 | prev_char = char
55 | return pairs
56 | 
57 | def text_standardize(text):
58 | """
59 | Fixes some issues the spacy tokenizer had on books corpus
60 | and also does some whitespace standardization.
61 | """
62 | text = text.replace('—', '-')
63 | text = text.replace('–', '-')
64 | text = text.replace('―', '-')
65 | text = text.replace('…', '...')
66 | text = text.replace('´', "'")
67 | text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
68 | text = re.sub(r'\s*\n\s*', ' \n ', text)
69 | text = re.sub(r'[^\S\n]+', ' ', text)
70 | return text.strip()
71 | 
72 | class OpenAIGPTTokenizer(object):
73 | """
74 | BPE tokenizer. Peculiarities:
75 | - lower-cases all inputs
76 | - uses the SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, falling back to BERT's BasicTokenizer if not.
77 | - argument special_tokens and function set_special_tokens:
78 | can be used to add additional symbols (ex: "__classify__") to a vocabulary.
79 | """
80 | @classmethod
81 | def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
82 | """
83 | Instantiate an OpenAIGPTTokenizer from a pre-trained model file.
84 | Download and cache the vocabulary and merges files if needed.
85 | """
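# A minimal usage sketch (illustrative; the input sentence is assumed, not taken from this file):
#
#   tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
#   tokens = tokenizer.tokenize("here is some text to encode")
#   ids = tokenizer.convert_tokens_to_ids(tokens)
#   text = tokenizer.decode(ids, clean_up_tokenization_spaces=True)
#
# The vocabulary and merges files are fetched from the archive maps above and cached on first use.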
86 | if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
87 | vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
88 | merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
89 | else:
90 | vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
91 | merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
92 | # redirect to the cache, if necessary
93 | try:
94 | resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
95 | resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
96 | except EnvironmentError:
97 | logger.error(
98 | "Model name '{}' was not found in model name list ({}). "
99 | "We assumed '{}' was a path or url but couldn't find files {} and {} "
100 | "at this path or url.".format(
101 | pretrained_model_name_or_path,
102 | ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
103 | pretrained_model_name_or_path,
104 | vocab_file, merges_file))
105 | return None
106 | if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
107 | logger.info("loading vocabulary file {}".format(vocab_file))
108 | logger.info("loading merges file {}".format(merges_file))
109 | else:
110 | logger.info("loading vocabulary file {} from cache at {}".format(
111 | vocab_file, resolved_vocab_file))
112 | logger.info("loading merges file {} from cache at {}".format(
113 | merges_file, resolved_merges_file))
114 | if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
115 | # if we're using a pretrained model, ensure the tokenizer won't index sequences longer
116 | # than the number of positional embeddings
117 | max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
118 | kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
119 | # Instantiate tokenizer.
120 | tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs)
121 | return tokenizer
122 | 
123 | def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
124 | try:
125 | import ftfy
126 | import spacy
127 | self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
128 | self.fix_text = ftfy.fix_text
129 | except ImportError:
130 | logger.warning("ftfy or spacy is not installed; using BERT's BasicTokenizer instead of SpaCy & ftfy.")
131 | self.nlp = BasicTokenizer(do_lower_case=True,
132 | never_split=special_tokens if special_tokens is not None else [])
133 | self.fix_text = None
134 | 
135 | self.max_len = max_len if max_len is not None else int(1e12)
136 | self.encoder = json.load(open(vocab_file, encoding="utf-8"))
137 | self.decoder = {v:k for k,v in self.encoder.items()}
138 | merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
139 | merges = [tuple(merge.split()) for merge in merges]
140 | self.bpe_ranks = dict(zip(merges, range(len(merges))))
141 | self.cache = {}
142 | self.set_special_tokens(special_tokens)
143 | 
144 | def __len__(self):
145 | return len(self.encoder) + len(self.special_tokens)
146 | 
147 | def set_special_tokens(self, special_tokens):
148 | """ Add a list of additional tokens to the encoder.
149 | The additional tokens are indexed starting from the last index of the
150 | current vocabulary in the order of the `special_tokens` list.
151 | """
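# For example, run_openai_gpt.py below passes special_tokens=['_start_', '_delimiter_', '_classify_'];
# these symbols are appended after the BPE vocabulary and their embeddings are fine-tuned with the model.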
152 | if not special_tokens:
153 | self.special_tokens = {}
154 | self.special_tokens_decoder = {}
155 | return
156 | self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
157 | self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}
158 | if self.fix_text is None:
159 | # Using BERT's BasicTokenizer: we can update the tokenizer
160 | self.nlp.never_split = special_tokens
161 | logger.info("Special tokens {}".format(self.special_tokens))
162 | 
163 | def bpe(self, token):
164 | word = tuple(token[:-1]) + (token[-1] + '</w>',)
165 | if token in self.cache:
166 | return self.cache[token]
167 | pairs = get_pairs(word)
168 | 
169 | if not pairs:
170 | return token+'</w>'
171 | 
172 | while True:
173 | bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
174 | if bigram not in self.bpe_ranks:
175 | break
176 | first, second = bigram
177 | new_word = []
178 | i = 0
179 | while i < len(word):
180 | try:
181 | j = word.index(first, i)
182 | new_word.extend(word[i:j])
183 | i = j
184 | except ValueError:
185 | new_word.extend(word[i:])
186 | break
187 | 
188 | if word[i] == first and i < len(word)-1 and word[i+1] == second:
189 | new_word.append(first+second)
190 | i += 2
191 | else:
192 | new_word.append(word[i])
193 | i += 1
194 | new_word = tuple(new_word)
195 | word = new_word
196 | if len(word) == 1:
197 | break
198 | else:
199 | pairs = get_pairs(word)
200 | word = ' '.join(word)
201 | if word == '\n  </w>':
202 | word = '\n</w>'
203 | self.cache[token] = word
204 | return word
205 | 
206 | def tokenize(self, text):
207 | """ Tokenize a string. """
208 | split_tokens = []
209 | if self.fix_text is None:
210 | # Using BERT's BasicTokenizer
211 | text = self.nlp.tokenize(text)
212 | for token in text:
213 | split_tokens.extend([t for t in self.bpe(token).split(' ')])
214 | else:
215 | # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
216 | text = self.nlp(text_standardize(self.fix_text(text)))
217 | for token in text:
218 | split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
219 | return split_tokens
220 | 
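# A worked sketch of bpe() on a single token (illustrative; the merge ranks are assumed, not real):
# for the token "lower", the initial word is ('l', 'o', 'w', 'e', 'r</w>'). If ('e', 'r</w>') and
# ('l', 'o') are the only pairs present in bpe_ranks, in that order of priority, the merge loop
# produces ('l', 'o', 'w', 'er</w>') and then ('lo', 'w', 'er</w>'); no remaining pair is in
# bpe_ranks, so the loop stops and the space-joined string "lo w er</w>" is cached and returned.
# tokenize() then splits that string on spaces, yielding the sub-word tokens ['lo', 'w', 'er</w>'].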
221 | def convert_tokens_to_ids(self, tokens):
222 | """ Converts a sequence of tokens into ids using the vocab. """
223 | ids = []
224 | if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
225 | if tokens in self.special_tokens:
226 | return self.special_tokens[tokens]
227 | else:
228 | return self.encoder.get(tokens, 0)
229 | for token in tokens:
230 | if token in self.special_tokens:
231 | ids.append(self.special_tokens[token])
232 | else:
233 | ids.append(self.encoder.get(token, 0))
234 | if len(ids) > self.max_len:
235 | logger.warning(
236 | "Token indices sequence length is longer than the specified maximum "
237 | " sequence length for this OpenAI GPT model ({} > {}). Running this"
238 | " sequence through the model will result in indexing errors".format(len(ids), self.max_len)
239 | )
240 | return ids
241 | 
242 | def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
243 | """Converts a sequence of ids into BPE tokens using the vocab."""
244 | tokens = []
245 | for i in ids:
246 | if i in self.special_tokens_decoder:
247 | if not skip_special_tokens:
248 | tokens.append(self.special_tokens_decoder[i])
249 | else:
250 | tokens.append(self.decoder[i])
251 | return tokens
252 | 
253 | def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=False):
254 | """Converts a sequence of ids into a string."""
255 | tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
256 | out_string = ''.join(tokens).replace('</w>', ' ').strip()
257 | if clean_up_tokenization_spaces:
258 | out_string = out_string.replace('<unk>', '')
259 | out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',').replace(' ,', ','
260 | ).replace(" n't", "n't").replace(" 'm", "'m").replace(" 're", "'re").replace(" do not", " don't"
261 | ).replace(" 's", "'s").replace(" t ", "'t ").replace(" s ", "'s ").replace(" m ", "'m "
262 | ).replace(" 've", "'ve")
263 | return out_string
264 | 
--------------------------------------------------------------------------------
/examples/extract_features.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Extract pre-computed feature vectors from a PyTorch BERT model."""
16 | 
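# Example invocation (illustrative; file names are assumed): each input line holds one sentence,
# or a sentence pair separated by " ||| ", and one JSON record of features is written per line:
#
#   python extract_features.py \
#     --input_file sentences.txt \
#     --output_file features.jsonl \
#     --bert_model bert-base-uncased \
#     --do_lower_case \
#     --layers -1,-2,-3,-4 \
#     --max_seq_length 128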
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | import argparse
22 | import collections
23 | import logging
24 | import json
25 | import re
26 | 
27 | import torch
28 | from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
29 | from torch.utils.data.distributed import DistributedSampler
30 | 
31 | from pytorch_pretrained_bert.tokenization import BertTokenizer
32 | from pytorch_pretrained_bert.modeling import BertModel
33 | 
34 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
35 | datefmt = '%m/%d/%Y %H:%M:%S',
36 | level = logging.INFO)
37 | logger = logging.getLogger(__name__)
38 | 
39 | 
40 | class InputExample(object):
41 | 
42 | def __init__(self, unique_id, text_a, text_b):
43 | self.unique_id = unique_id
44 | self.text_a = text_a
45 | self.text_b = text_b
46 | 
47 | 
48 | class InputFeatures(object):
49 | """A single set of features of data."""
50 | 
51 | def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
52 | self.unique_id = unique_id
53 | self.tokens = tokens
54 | self.input_ids = input_ids
55 | self.input_mask = input_mask
56 | self.input_type_ids = input_type_ids
57 | 
58 | 
59 | def convert_examples_to_features(examples, seq_length, tokenizer):
60 | """Loads a data file into a list of `InputFeatures`."""
61 | 
62 | features = []
63 | for (ex_index, example) in enumerate(examples):
64 | tokens_a = tokenizer.tokenize(example.text_a)
65 | 
66 | tokens_b = None
67 | if example.text_b:
68 | tokens_b = tokenizer.tokenize(example.text_b)
69 | 
70 | if tokens_b:
71 | # Modifies `tokens_a` and `tokens_b` in place so that the total
72 | # length is less than the specified length.
73 | # Account for [CLS], [SEP], [SEP] with "- 3"
74 | _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
75 | else:
76 | # Account for [CLS] and [SEP] with "- 2"
77 | if len(tokens_a) > seq_length - 2:
78 | tokens_a = tokens_a[0:(seq_length - 2)]
79 | 
80 | # The convention in BERT is:
81 | # (a) For sequence pairs:
82 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
83 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
84 | # (b) For single sequences:
85 | # tokens: [CLS] the dog is hairy . [SEP]
86 | # type_ids: 0 0 0 0 0 0 0
87 | #
88 | # Where "type_ids" are used to indicate whether this is the first
89 | # sequence or the second sequence. The embedding vectors for `type=0` and
90 | # `type=1` were learned during pre-training and are added to the wordpiece
91 | # embedding vector (and position vector). This is not *strictly* necessary
92 | # since the [SEP] token unambiguously separates the sequences, but it makes
93 | # it easier for the model to learn the concept of sequences.
94 | #
95 | # For classification tasks, the first vector (corresponding to [CLS]) is
96 | # used as the "sentence vector". Note that this only makes sense because
97 | # the entire model is fine-tuned.
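# A concrete (illustrative) rendering of that layout for the hypothetical pair
# "is this book good ?" / "not really .":
#   tokens:     [CLS] is this book good ? [SEP] not really . [SEP]
#   type_ids:     0   0    0    0    0  0   0    1     1   1   1
#   input_mask:   1   1    1    1    1  1   1    1     1   1   1
# with input_ids, input_mask and input_type_ids then zero-padded up to seq_length.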
98 | tokens = [] 99 | input_type_ids = [] 100 | tokens.append("[CLS]") 101 | input_type_ids.append(0) 102 | for token in tokens_a: 103 | tokens.append(token) 104 | input_type_ids.append(0) 105 | tokens.append("[SEP]") 106 | input_type_ids.append(0) 107 | 108 | if tokens_b: 109 | for token in tokens_b: 110 | tokens.append(token) 111 | input_type_ids.append(1) 112 | tokens.append("[SEP]") 113 | input_type_ids.append(1) 114 | 115 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 116 | 117 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 118 | # tokens are attended to. 119 | input_mask = [1] * len(input_ids) 120 | 121 | # Zero-pad up to the sequence length. 122 | while len(input_ids) < seq_length: 123 | input_ids.append(0) 124 | input_mask.append(0) 125 | input_type_ids.append(0) 126 | 127 | assert len(input_ids) == seq_length 128 | assert len(input_mask) == seq_length 129 | assert len(input_type_ids) == seq_length 130 | 131 | if ex_index < 5: 132 | logger.info("*** Example ***") 133 | logger.info("unique_id: %s" % (example.unique_id)) 134 | logger.info("tokens: %s" % " ".join([str(x) for x in tokens])) 135 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 136 | logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) 137 | logger.info( 138 | "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids])) 139 | 140 | features.append( 141 | InputFeatures( 142 | unique_id=example.unique_id, 143 | tokens=tokens, 144 | input_ids=input_ids, 145 | input_mask=input_mask, 146 | input_type_ids=input_type_ids)) 147 | return features 148 | 149 | 150 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 151 | """Truncates a sequence pair in place to the maximum length.""" 152 | 153 | # This is a simple heuristic which will always truncate the longer sequence 154 | # one token at a time. This makes more sense than truncating an equal percent 155 | # of tokens from each, since if one sequence is very short then each token 156 | # that's truncated likely contains more information than a longer sequence. 
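# A small worked example (illustrative): with max_length = 6, len(tokens_a) = 5 and
# len(tokens_b) = 4, the loop pops from whichever list is currently longer:
# 5/4 -> 4/4 -> 4/3 -> 3/3, stopping once the combined length is <= 6.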
157 | while True: 158 | total_length = len(tokens_a) + len(tokens_b) 159 | if total_length <= max_length: 160 | break 161 | if len(tokens_a) > len(tokens_b): 162 | tokens_a.pop() 163 | else: 164 | tokens_b.pop() 165 | 166 | 167 | def read_examples(input_file): 168 | """Read a list of `InputExample`s from an input file.""" 169 | examples = [] 170 | unique_id = 0 171 | with open(input_file, "r", encoding='utf-8') as reader: 172 | while True: 173 | line = reader.readline() 174 | if not line: 175 | break 176 | line = line.strip() 177 | text_a = None 178 | text_b = None 179 | m = re.match(r"^(.*) \|\|\| (.*)$", line) 180 | if m is None: 181 | text_a = line 182 | else: 183 | text_a = m.group(1) 184 | text_b = m.group(2) 185 | examples.append( 186 | InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) 187 | unique_id += 1 188 | return examples 189 | 190 | 191 | def main(): 192 | parser = argparse.ArgumentParser() 193 | 194 | ## Required parameters 195 | parser.add_argument("--input_file", default=None, type=str, required=True) 196 | parser.add_argument("--output_file", default=None, type=str, required=True) 197 | parser.add_argument("--bert_model", default=None, type=str, required=True, 198 | help="Bert pre-trained model selected in the list: bert-base-uncased, " 199 | "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") 200 | 201 | ## Other parameters 202 | parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") 203 | parser.add_argument("--layers", default="-1,-2,-3,-4", type=str) 204 | parser.add_argument("--max_seq_length", default=128, type=int, 205 | help="The maximum total input sequence length after WordPiece tokenization. Sequences longer " 206 | "than this will be truncated, and sequences shorter than this will be padded.") 207 | parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.") 208 | parser.add_argument("--local_rank", 209 | type=int, 210 | default=-1, 211 | help = "local_rank for distributed training on gpus") 212 | parser.add_argument("--no_cuda", 213 | action='store_true', 214 | help="Whether not to use CUDA when available") 215 | 216 | args = parser.parse_args() 217 | 218 | if args.local_rank == -1 or args.no_cuda: 219 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 220 | n_gpu = torch.cuda.device_count() 221 | else: 222 | device = torch.device("cuda", args.local_rank) 223 | n_gpu = 1 224 | # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 225 | torch.distributed.init_process_group(backend='nccl') 226 | logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(args.local_rank != -1))) 227 | 228 | layer_indexes = [int(x) for x in args.layers.split(",")] 229 | 230 | tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) 231 | 232 | examples = read_examples(args.input_file) 233 | 234 | features = convert_examples_to_features( 235 | examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer) 236 | 237 | unique_id_to_feature = {} 238 | for feature in features: 239 | unique_id_to_feature[feature.unique_id] = feature 240 | 241 | model = BertModel.from_pretrained(args.bert_model) 242 | model.to(device) 243 | 244 | if args.local_rank != -1: 245 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], 246 | output_device=args.local_rank) 247 | elif n_gpu > 1: 
248 | model = torch.nn.DataParallel(model) 249 | 250 | all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) 251 | all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) 252 | all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) 253 | 254 | eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index) 255 | if args.local_rank == -1: 256 | eval_sampler = SequentialSampler(eval_data) 257 | else: 258 | eval_sampler = DistributedSampler(eval_data) 259 | eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) 260 | 261 | model.eval() 262 | with open(args.output_file, "w", encoding='utf-8') as writer: 263 | for input_ids, input_mask, example_indices in eval_dataloader: 264 | input_ids = input_ids.to(device) 265 | input_mask = input_mask.to(device) 266 | 267 | all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask) 268 | all_encoder_layers = all_encoder_layers 269 | 270 | for b, example_index in enumerate(example_indices): 271 | feature = features[example_index.item()] 272 | unique_id = int(feature.unique_id) 273 | # feature = unique_id_to_feature[unique_id] 274 | output_json = collections.OrderedDict() 275 | output_json["linex_index"] = unique_id 276 | all_out_features = [] 277 | for (i, token) in enumerate(feature.tokens): 278 | all_layers = [] 279 | for (j, layer_index) in enumerate(layer_indexes): 280 | layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy() 281 | layer_output = layer_output[b] 282 | layers = collections.OrderedDict() 283 | layers["index"] = layer_index 284 | layers["values"] = [ 285 | round(x.item(), 6) for x in layer_output[i] 286 | ] 287 | all_layers.append(layers) 288 | out_features = collections.OrderedDict() 289 | out_features["token"] = token 290 | out_features["layers"] = all_layers 291 | all_out_features.append(out_features) 292 | output_json["features"] = all_out_features 293 | writer.write(json.dumps(output_json) + "\n") 294 | 295 | 296 | if __name__ == "__main__": 297 | main() 298 | -------------------------------------------------------------------------------- /examples/lm_finetuning/pregenerate_training_data.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from pathlib import Path 3 | from tqdm import tqdm, trange 4 | from tempfile import TemporaryDirectory 5 | import shelve 6 | 7 | from random import random, randint, shuffle, choice, sample 8 | from pytorch_pretrained_bert.tokenization import BertTokenizer 9 | import numpy as np 10 | import json 11 | 12 | 13 | class DocumentDatabase: 14 | def __init__(self, reduce_memory=False): 15 | if reduce_memory: 16 | self.temp_dir = TemporaryDirectory() 17 | self.working_dir = Path(self.temp_dir.name) 18 | self.document_shelf_filepath = self.working_dir / 'shelf.db' 19 | self.document_shelf = shelve.open(str(self.document_shelf_filepath), 20 | flag='n', protocol=-1) 21 | self.documents = None 22 | else: 23 | self.documents = [] 24 | self.document_shelf = None 25 | self.document_shelf_filepath = None 26 | self.temp_dir = None 27 | self.doc_lengths = [] 28 | self.doc_cumsum = None 29 | self.cumsum_max = None 30 | self.reduce_memory = reduce_memory 31 | 32 | def add_document(self, document): 33 | if self.reduce_memory: 34 | current_idx = len(self.doc_lengths) 35 | self.document_shelf[str(current_idx)] = document 36 | else: 37 | self.documents.append(document) 38 | 
self.doc_lengths.append(len(document)) 39 | 40 | def _precalculate_doc_weights(self): 41 | self.doc_cumsum = np.cumsum(self.doc_lengths) 42 | self.cumsum_max = self.doc_cumsum[-1] 43 | 44 | def sample_doc(self, current_idx, sentence_weighted=True): 45 | # Uses the current iteration counter to ensure we don't sample the same doc twice 46 | if sentence_weighted: 47 | # With sentence weighting, we sample docs proportionally to their sentence length 48 | if self.doc_cumsum is None or len(self.doc_cumsum) != len(self.doc_lengths): 49 | self._precalculate_doc_weights() 50 | rand_start = self.doc_cumsum[current_idx] 51 | rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx] 52 | sentence_index = randint(rand_start, rand_end-1) % self.cumsum_max 53 | sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right') 54 | else: 55 | # If we don't use sentence weighting, then every doc has an equal chance to be chosen 56 | sampled_doc_index = current_idx + randint(1, len(self.doc_lengths)-1) 57 | assert sampled_doc_index != current_idx 58 | if self.reduce_memory: 59 | return self.document_shelf[str(sampled_doc_index)] 60 | else: 61 | return self.documents[sampled_doc_index] 62 | 63 | def __len__(self): 64 | return len(self.doc_lengths) 65 | 66 | def __getitem__(self, item): 67 | if self.reduce_memory: 68 | return self.document_shelf[str(item)] 69 | else: 70 | return self.documents[item] 71 | 72 | def __enter__(self): 73 | return self 74 | 75 | def __exit__(self, exc_type, exc_val, traceback): 76 | if self.document_shelf is not None: 77 | self.document_shelf.close() 78 | if self.temp_dir is not None: 79 | self.temp_dir.cleanup() 80 | 81 | 82 | def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens): 83 | """Truncates a pair of sequences to a maximum sequence length. Lifted from Google's BERT repo.""" 84 | while True: 85 | total_length = len(tokens_a) + len(tokens_b) 86 | if total_length <= max_num_tokens: 87 | break 88 | 89 | trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b 90 | assert len(trunc_tokens) >= 1 91 | 92 | # We want to sometimes truncate from the front and sometimes from the 93 | # back to add more randomness and avoid biases. 94 | if random() < 0.5: 95 | del trunc_tokens[0] 96 | else: 97 | trunc_tokens.pop() 98 | 99 | 100 | def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, vocab_list): 101 | """Creates the predictions for the masked LM objective. 
This is mostly copied from the Google BERT repo, but 102 | with several refactors to clean it up and remove a lot of unnecessary variables.""" 103 | cand_indices = [] 104 | for (i, token) in enumerate(tokens): 105 | if token == "[CLS]" or token == "[SEP]": 106 | continue 107 | cand_indices.append(i) 108 | 109 | num_to_mask = min(max_predictions_per_seq, 110 | max(1, int(round(len(tokens) * masked_lm_prob)))) 111 | shuffle(cand_indices) 112 | mask_indices = sorted(sample(cand_indices, num_to_mask)) 113 | masked_token_labels = [] 114 | for index in mask_indices: 115 | # 80% of the time, replace with [MASK] 116 | if random() < 0.8: 117 | masked_token = "[MASK]" 118 | else: 119 | # 10% of the time, keep original 120 | if random() < 0.5: 121 | masked_token = tokens[index] 122 | # 10% of the time, replace with random word 123 | else: 124 | masked_token = choice(vocab_list) 125 | masked_token_labels.append(tokens[index]) 126 | # Once we've saved the true label for that token, we can overwrite it with the masked version 127 | tokens[index] = masked_token 128 | 129 | return tokens, mask_indices, masked_token_labels 130 | 131 | 132 | def create_instances_from_document( 133 | doc_database, doc_idx, max_seq_length, short_seq_prob, 134 | masked_lm_prob, max_predictions_per_seq, vocab_list): 135 | """This code is mostly a duplicate of the equivalent function from Google BERT's repo. 136 | However, we make some changes and improvements. Sampling is improved and no longer requires a loop in this function. 137 | Also, documents are sampled proportionally to the number of sentences they contain, which means each sentence 138 | (rather than each document) has an equal chance of being sampled as a false example for the NextSentence task.""" 139 | document = doc_database[doc_idx] 140 | # Account for [CLS], [SEP], [SEP] 141 | max_num_tokens = max_seq_length - 3 142 | 143 | # We *usually* want to fill up the entire sequence since we are padding 144 | # to `max_seq_length` anyways, so short sequences are generally wasted 145 | # computation. However, we *sometimes* 146 | # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter 147 | # sequences to minimize the mismatch between pre-training and fine-tuning. 148 | # The `target_seq_length` is just a rough target however, whereas 149 | # `max_seq_length` is a hard limit. 150 | target_seq_length = max_num_tokens 151 | if random() < short_seq_prob: 152 | target_seq_length = randint(2, max_num_tokens) 153 | 154 | # We DON'T just concatenate all of the tokens from a document into a long 155 | # sequence and choose an arbitrary split point because this would make the 156 | # next sentence prediction task too easy. Instead, we split the input into 157 | # segments "A" and "B" based on the actual "sentences" provided by the user 158 | # input. 159 | instances = [] 160 | current_chunk = [] 161 | current_length = 0 162 | i = 0 163 | while i < len(document): 164 | segment = document[i] 165 | current_chunk.append(segment) 166 | current_length += len(segment) 167 | if i == len(document) - 1 or current_length >= target_seq_length: 168 | if current_chunk: 169 | # `a_end` is how many segments from `current_chunk` go into the `A` 170 | # (first) sentence. 
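# For example (illustrative): if current_chunk holds three tokenized sentences [s1, s2, s3]
# and a_end is drawn as 2, then tokens_a = s1 + s2, and tokens_b is either s3 ("actual next")
# or sentences taken from a randomly sampled other document ("random next").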
171 | a_end = 1 172 | if len(current_chunk) >= 2: 173 | a_end = randint(1, len(current_chunk) - 1) 174 | 175 | tokens_a = [] 176 | for j in range(a_end): 177 | tokens_a.extend(current_chunk[j]) 178 | 179 | tokens_b = [] 180 | 181 | # Random next 182 | if len(current_chunk) == 1 or random() < 0.5: 183 | is_random_next = True 184 | target_b_length = target_seq_length - len(tokens_a) 185 | 186 | # Sample a random document, with longer docs being sampled more frequently 187 | random_document = doc_database.sample_doc(current_idx=doc_idx, sentence_weighted=True) 188 | 189 | random_start = randint(0, len(random_document) - 1) 190 | for j in range(random_start, len(random_document)): 191 | tokens_b.extend(random_document[j]) 192 | if len(tokens_b) >= target_b_length: 193 | break 194 | # We didn't actually use these segments so we "put them back" so 195 | # they don't go to waste. 196 | num_unused_segments = len(current_chunk) - a_end 197 | i -= num_unused_segments 198 | # Actual next 199 | else: 200 | is_random_next = False 201 | for j in range(a_end, len(current_chunk)): 202 | tokens_b.extend(current_chunk[j]) 203 | truncate_seq_pair(tokens_a, tokens_b, max_num_tokens) 204 | 205 | assert len(tokens_a) >= 1 206 | assert len(tokens_b) >= 1 207 | 208 | tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"] 209 | # The segment IDs are 0 for the [CLS] token, the A tokens and the first [SEP] 210 | # They are 1 for the B tokens and the final [SEP] 211 | segment_ids = [0 for _ in range(len(tokens_a) + 2)] + [1 for _ in range(len(tokens_b) + 1)] 212 | 213 | tokens, masked_lm_positions, masked_lm_labels = create_masked_lm_predictions( 214 | tokens, masked_lm_prob, max_predictions_per_seq, vocab_list) 215 | 216 | instance = { 217 | "tokens": tokens, 218 | "segment_ids": segment_ids, 219 | "is_random_next": is_random_next, 220 | "masked_lm_positions": masked_lm_positions, 221 | "masked_lm_labels": masked_lm_labels} 222 | instances.append(instance) 223 | current_chunk = [] 224 | current_length = 0 225 | i += 1 226 | 227 | return instances 228 | 229 | 230 | def main(): 231 | parser = ArgumentParser() 232 | parser.add_argument('--train_corpus', type=Path, required=True) 233 | parser.add_argument("--output_dir", type=Path, required=True) 234 | parser.add_argument("--bert_model", type=str, required=True, 235 | choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased", 236 | "bert-base-multilingual", "bert-base-chinese"]) 237 | parser.add_argument("--do_lower_case", action="store_true") 238 | 239 | parser.add_argument("--reduce_memory", action="store_true", 240 | help="Reduce memory usage for large datasets by keeping data on disc rather than in memory") 241 | 242 | parser.add_argument("--epochs_to_generate", type=int, default=3, 243 | help="Number of epochs of data to pregenerate") 244 | parser.add_argument("--max_seq_len", type=int, default=128) 245 | parser.add_argument("--short_seq_prob", type=float, default=0.1, 246 | help="Probability of making a short sentence as a training example") 247 | parser.add_argument("--masked_lm_prob", type=float, default=0.15, 248 | help="Probability of masking each token for the LM task") 249 | parser.add_argument("--max_predictions_per_seq", type=int, default=20, 250 | help="Maximum number of tokens to mask in each sequence") 251 | 252 | args = parser.parse_args() 253 | 254 | tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) 255 | vocab_list = list(tokenizer.vocab.keys()) 256 | with 
DocumentDatabase(reduce_memory=args.reduce_memory) as docs: 257 | with args.train_corpus.open() as f: 258 | doc = [] 259 | for line in tqdm(f, desc="Loading Dataset", unit=" lines"): 260 | line = line.strip() 261 | if line == "": 262 | docs.add_document(doc) 263 | doc = [] 264 | else: 265 | tokens = tokenizer.tokenize(line) 266 | doc.append(tokens) 267 | 268 | args.output_dir.mkdir(exist_ok=True) 269 | for epoch in trange(args.epochs_to_generate, desc="Epoch"): 270 | epoch_filename = args.output_dir / f"epoch_{epoch}.json" 271 | num_instances = 0 272 | with epoch_filename.open('w') as epoch_file: 273 | for doc_idx in trange(len(docs), desc="Document"): 274 | doc_instances = create_instances_from_document( 275 | docs, doc_idx, max_seq_length=args.max_seq_len, short_seq_prob=args.short_seq_prob, 276 | masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq, 277 | vocab_list=vocab_list) 278 | doc_instances = [json.dumps(instance) for instance in doc_instances] 279 | for instance in doc_instances: 280 | epoch_file.write(instance + '\n') 281 | num_instances += 1 282 | metrics_file = args.output_dir / f"epoch_{epoch}_metrics.json" 283 | with metrics_file.open('w') as metrics_file: 284 | metrics = { 285 | "num_training_examples": num_instances, 286 | "max_seq_len": args.max_seq_len 287 | } 288 | metrics_file.write(json.dumps(metrics)) 289 | 290 | 291 | if __name__ == '__main__': 292 | main() 293 | -------------------------------------------------------------------------------- /examples/run_openai_gpt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ OpenAI GPT model fine-tuning script. 
17 | Adapted from https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/train.py
18 | which is itself adapted from https://github.com/openai/finetune-transformer-lm/blob/master/train.py
19 | 
20 | With its default values, this script fine-tunes and evaluates a pretrained OpenAI GPT on the RocStories dataset:
21 | python run_openai_gpt.py \
22 | --model_name openai-gpt \
23 | --do_train \
24 | --do_eval \
25 | --train_dataset $ROC_STORIES_DIR/cloze_test_val__spring2016\ -\ cloze_test_ALL_val.csv \
26 | --eval_dataset $ROC_STORIES_DIR/cloze_test_test__spring2016\ -\ cloze_test_ALL_test.csv \
27 | --output_dir ../log \
28 | --train_batch_size 16 \
29 | """
30 | import argparse
31 | import os
32 | import csv
33 | import random
34 | import logging
35 | from tqdm import tqdm, trange
36 | 
37 | import numpy as np
38 | import torch
39 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
40 | TensorDataset)
41 | 
42 | from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, OpenAIAdam, cached_path
43 | 
44 | ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
45 | 
46 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
47 | datefmt = '%m/%d/%Y %H:%M:%S',
48 | level = logging.INFO)
49 | logger = logging.getLogger(__name__)
50 | 
51 | def accuracy(out, labels):
52 | outputs = np.argmax(out, axis=1)
53 | return np.sum(outputs == labels)
54 | 
55 | def load_rocstories_dataset(dataset_path):
56 | """ Output a list of tuples (story, 1st continuation, 2nd continuation, label) """
57 | with open(dataset_path, encoding='utf_8') as f:
58 | f = csv.reader(f)
59 | output = []
60 | next(f) # skip the first line
61 | for line in tqdm(f):
62 | output.append((' '.join(line[1:5]), line[5], line[6], int(line[-1])-1))
63 | return output
64 | 
65 | def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
66 | """ Pre-process datasets containing lists of tuples (story, 1st continuation, 2nd continuation, label)
67 | 
68 | into Transformer inputs of shape (n_batch, n_alternative, length) comprising, for each batch and continuation:
69 | input_ids[batch, alternative, :] = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
70 | """
71 | tensor_datasets = []
72 | for dataset in encoded_datasets:
73 | n_batch = len(dataset)
74 | input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
75 | mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
76 | lm_labels = np.full((n_batch, 2, input_len), fill_value=-1, dtype=np.int64)
77 | mc_labels = np.zeros((n_batch,), dtype=np.int64)
78 | for i, (story, cont1, cont2, mc_label) in enumerate(dataset):
79 | with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
80 | with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
81 | input_ids[i, 0, :len(with_cont1)] = with_cont1
82 | input_ids[i, 1, :len(with_cont2)] = with_cont2
83 | mc_token_ids[i, 0] = len(with_cont1) - 1
84 | mc_token_ids[i, 1] = len(with_cont2) - 1
85 | lm_labels[i, 0, :len(with_cont1)-1] = with_cont1[1:]
86 | lm_labels[i, 1, :len(with_cont2)-1] = with_cont2[1:]
87 | mc_labels[i] = mc_label
88 | all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels)
89 | tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
90 | return tensor_datasets
91 | 
92 | def main():
93 | parser = argparse.ArgumentParser()
94 | 
parser.add_argument('--model_name', type=str, default='openai-gpt', 95 | help='pretrained model name') 96 | parser.add_argument("--do_train", action='store_true', help="Whether to run training.") 97 | parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") 98 | parser.add_argument("--output_dir", default=None, type=str, required=True, 99 | help="The output directory where the model predictions and checkpoints will be written.") 100 | parser.add_argument('--train_dataset', type=str, default='') 101 | parser.add_argument('--eval_dataset', type=str, default='') 102 | parser.add_argument('--seed', type=int, default=42) 103 | parser.add_argument('--num_train_epochs', type=int, default=3) 104 | parser.add_argument('--train_batch_size', type=int, default=8) 105 | parser.add_argument('--eval_batch_size', type=int, default=16) 106 | parser.add_argument('--max_grad_norm', type=int, default=1) 107 | parser.add_argument('--learning_rate', type=float, default=6.25e-5) 108 | parser.add_argument('--warmup_proportion', type=float, default=0.002) 109 | parser.add_argument('--lr_schedule', type=str, default='warmup_linear') 110 | parser.add_argument('--weight_decay', type=float, default=0.01) 111 | parser.add_argument('--lm_coef', type=float, default=0.9) 112 | parser.add_argument('--n_valid', type=int, default=374) 113 | 114 | parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") 115 | parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") 116 | args = parser.parse_args() 117 | print(args) 118 | 119 | if args.server_ip and args.server_port: 120 | # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script 121 | import ptvsd 122 | print("Waiting for debugger attach") 123 | ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) 124 | ptvsd.wait_for_attach() 125 | 126 | random.seed(args.seed) 127 | np.random.seed(args.seed) 128 | torch.manual_seed(args.seed) 129 | torch.cuda.manual_seed_all(args.seed) 130 | 131 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 132 | n_gpu = torch.cuda.device_count() 133 | logger.info("device: {}, n_gpu {}".format(device, n_gpu)) 134 | 135 | if not args.do_train and not args.do_eval: 136 | raise ValueError("At least one of `do_train` or `do_eval` must be True.") 137 | 138 | if not os.path.exists(args.output_dir): 139 | os.makedirs(args.output_dir) 140 | 141 | # Load tokenizer and model 142 | # This loading functions also add new tokens and embeddings called `special tokens` 143 | # These new embeddings will be fine-tuned on the RocStories dataset 144 | special_tokens = ['_start_', '_delimiter_', '_classify_'] 145 | tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens) 146 | special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens) 147 | model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens)) 148 | model.to(device) 149 | 150 | # Load and encode the datasets 151 | if not args.train_dataset and not args.eval_dataset: 152 | roc_stories = cached_path(ROCSTORIES_URL) 153 | def tokenize_and_encode(obj): 154 | """ Tokenize and encode a nested object """ 155 | if isinstance(obj, str): 156 | return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj)) 157 | elif isinstance(obj, int): 158 | return obj 159 | return 
list(tokenize_and_encode(o) for o in obj) 160 | logger.info("Encoding dataset...") 161 | train_dataset = load_rocstories_dataset(args.train_dataset) 162 | eval_dataset = load_rocstories_dataset(args.eval_dataset) 163 | datasets = (train_dataset, eval_dataset) 164 | encoded_datasets = tokenize_and_encode(datasets) 165 | 166 | # Compute the max input length for the Transformer 167 | max_length = model.config.n_positions // 2 - 2 168 | input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3 \ 169 | for dataset in encoded_datasets for story, cont1, cont2, _ in dataset) 170 | input_length = min(input_length, model.config.n_positions) # Max size of input for the pre-trained model 171 | 172 | # Prepare inputs tensors and dataloaders 173 | tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids) 174 | train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1] 175 | 176 | train_data = TensorDataset(*train_tensor_dataset) 177 | train_sampler = RandomSampler(train_data) 178 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) 179 | 180 | eval_data = TensorDataset(*eval_tensor_dataset) 181 | eval_sampler = SequentialSampler(eval_data) 182 | eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) 183 | 184 | # Prepare optimizer 185 | param_optimizer = list(model.named_parameters()) 186 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 187 | optimizer_grouped_parameters = [ 188 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, 189 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 190 | ] 191 | num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size 192 | optimizer = OpenAIAdam(optimizer_grouped_parameters, 193 | lr=args.learning_rate, 194 | warmup=args.warmup_proportion, 195 | max_grad_norm=args.max_grad_norm, 196 | weight_decay=args.weight_decay, 197 | t_total=num_train_optimization_steps) 198 | 199 | if args.do_train: 200 | nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None 201 | model.train() 202 | for _ in trange(int(args.num_train_epochs), desc="Epoch"): 203 | tr_loss = 0 204 | nb_tr_steps = 0 205 | tqdm_bar = tqdm(train_dataloader, desc="Training") 206 | for step, batch in enumerate(tqdm_bar): 207 | batch = tuple(t.to(device) for t in batch) 208 | input_ids, mc_token_ids, lm_labels, mc_labels = batch 209 | losses = model(input_ids, mc_token_ids, lm_labels, mc_labels) 210 | loss = args.lm_coef * losses[0] + losses[1] 211 | loss.backward() 212 | optimizer.step() 213 | optimizer.zero_grad() 214 | tr_loss += loss.item() 215 | exp_average_loss = loss.item() if exp_average_loss is None else 0.7*exp_average_loss+0.3*loss.item() 216 | nb_tr_steps += 1 217 | tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0]) 218 | 219 | # Save a trained model 220 | if args.do_train: 221 | model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self 222 | output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") 223 | config = model.config 224 | torch.save(model_to_save.state_dict(), output_model_file) 225 | 226 | # Load a trained model that you have fine-tuned 227 | model_state_dict = torch.load(output_model_file) 228 | model = OpenAIGPTDoubleHeadsModel(config) 229 | 
model.load_state_dict(model_state_dict) 230 | model.to(device) 231 | 232 | if args.do_eval: 233 | model.eval() 234 | eval_loss, eval_accuracy = 0, 0 235 | nb_eval_steps, nb_eval_examples = 0, 0 236 | for batch in tqdm(eval_dataloader, desc="Evaluating"): 237 | batch = tuple(t.to(device) for t in batch) 238 | input_ids, mc_token_ids, lm_labels, mc_labels = batch 239 | with torch.no_grad(): 240 | _, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels) 241 | _, mc_logits = model(input_ids, mc_token_ids) 242 | 243 | mc_logits = mc_logits.detach().cpu().numpy() 244 | mc_labels = mc_labels.to('cpu').numpy() 245 | tmp_eval_accuracy = accuracy(mc_logits, mc_labels) 246 | 247 | eval_loss += mc_loss.mean().item() 248 | eval_accuracy += tmp_eval_accuracy 249 | 250 | nb_eval_examples += input_ids.size(0) 251 | nb_eval_steps += 1 252 | 253 | eval_loss = eval_loss / nb_eval_steps 254 | eval_accuracy = eval_accuracy / nb_eval_examples 255 | train_loss = tr_loss/nb_tr_steps if args.do_train else None 256 | result = {'eval_loss': eval_loss, 257 | 'eval_accuracy': eval_accuracy, 258 | 'train_loss': train_loss} 259 | 260 | output_eval_file = os.path.join(args.output_dir, "eval_results.txt") 261 | with open(output_eval_file, "w") as writer: 262 | logger.info("***** Eval results *****") 263 | for key in sorted(result.keys()): 264 | logger.info(" %s = %s", key, str(result[key])) 265 | writer.write("%s = %s\n" % (key, str(result[key]))) 266 | 267 | if __name__ == '__main__': 268 | main() 269 | -------------------------------------------------------------------------------- /tests/modeling_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import json 21 | import random 22 | 23 | import torch 24 | 25 | from pytorch_pretrained_bert import (BertConfig, BertModel, BertForMaskedLM, 26 | BertForNextSentencePrediction, BertForPreTraining, 27 | BertForQuestionAnswering, BertForSequenceClassification, 28 | BertForTokenClassification) 29 | 30 | 31 | class BertModelTest(unittest.TestCase): 32 | class BertModelTester(object): 33 | 34 | def __init__(self, 35 | parent, 36 | batch_size=13, 37 | seq_length=7, 38 | is_training=True, 39 | use_input_mask=True, 40 | use_token_type_ids=True, 41 | use_labels=True, 42 | vocab_size=99, 43 | hidden_size=32, 44 | num_hidden_layers=5, 45 | num_attention_heads=4, 46 | intermediate_size=37, 47 | hidden_act="gelu", 48 | hidden_dropout_prob=0.1, 49 | attention_probs_dropout_prob=0.1, 50 | max_position_embeddings=512, 51 | type_vocab_size=16, 52 | type_sequence_label_size=2, 53 | initializer_range=0.02, 54 | num_labels=3, 55 | scope=None): 56 | self.parent = parent 57 | self.batch_size = batch_size 58 | self.seq_length = seq_length 59 | self.is_training = is_training 60 | self.use_input_mask = use_input_mask 61 | self.use_token_type_ids = use_token_type_ids 62 | self.use_labels = use_labels 63 | self.vocab_size = vocab_size 64 | self.hidden_size = hidden_size 65 | self.num_hidden_layers = num_hidden_layers 66 | self.num_attention_heads = num_attention_heads 67 | self.intermediate_size = intermediate_size 68 | self.hidden_act = hidden_act 69 | self.hidden_dropout_prob = hidden_dropout_prob 70 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 71 | self.max_position_embeddings = max_position_embeddings 72 | self.type_vocab_size = type_vocab_size 73 | self.type_sequence_label_size = type_sequence_label_size 74 | self.initializer_range = initializer_range 75 | self.num_labels = num_labels 76 | self.scope = scope 77 | 78 | def prepare_config_and_inputs(self): 79 | input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size) 80 | 81 | input_mask = None 82 | if self.use_input_mask: 83 | input_mask = BertModelTest.ids_tensor([self.batch_size, self.seq_length], vocab_size=2) 84 | 85 | token_type_ids = None 86 | if self.use_token_type_ids: 87 | token_type_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) 88 | 89 | sequence_labels = None 90 | token_labels = None 91 | if self.use_labels: 92 | sequence_labels = BertModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size) 93 | token_labels = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.num_labels) 94 | 95 | config = BertConfig( 96 | vocab_size_or_config_json_file=self.vocab_size, 97 | hidden_size=self.hidden_size, 98 | num_hidden_layers=self.num_hidden_layers, 99 | num_attention_heads=self.num_attention_heads, 100 | intermediate_size=self.intermediate_size, 101 | hidden_act=self.hidden_act, 102 | hidden_dropout_prob=self.hidden_dropout_prob, 103 | attention_probs_dropout_prob=self.attention_probs_dropout_prob, 104 | max_position_embeddings=self.max_position_embeddings, 105 | type_vocab_size=self.type_vocab_size, 106 | initializer_range=self.initializer_range) 107 | 108 | return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels 109 | 110 | def check_loss_output(self, result): 111 | self.parent.assertListEqual( 112 | list(result["loss"].size()), 113 | []) 114 | 115 | def 
create_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): 116 | model = BertModel(config=config) 117 | model.eval() 118 | all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) 119 | outputs = { 120 | "sequence_output": all_encoder_layers[-1], 121 | "pooled_output": pooled_output, 122 | "all_encoder_layers": all_encoder_layers, 123 | } 124 | return outputs 125 | 126 | def check_bert_model_output(self, result): 127 | self.parent.assertListEqual( 128 | [size for layer in result["all_encoder_layers"] for size in layer.size()], 129 | [self.batch_size, self.seq_length, self.hidden_size] * self.num_hidden_layers) 130 | self.parent.assertListEqual( 131 | list(result["sequence_output"].size()), 132 | [self.batch_size, self.seq_length, self.hidden_size]) 133 | self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size]) 134 | 135 | 136 | def create_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): 137 | model = BertForMaskedLM(config=config) 138 | model.eval() 139 | loss = model(input_ids, token_type_ids, input_mask, token_labels) 140 | prediction_scores = model(input_ids, token_type_ids, input_mask) 141 | outputs = { 142 | "loss": loss, 143 | "prediction_scores": prediction_scores, 144 | } 145 | return outputs 146 | 147 | def check_bert_for_masked_lm_output(self, result): 148 | self.parent.assertListEqual( 149 | list(result["prediction_scores"].size()), 150 | [self.batch_size, self.seq_length, self.vocab_size]) 151 | 152 | def create_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): 153 | model = BertForNextSentencePrediction(config=config) 154 | model.eval() 155 | loss = model(input_ids, token_type_ids, input_mask, sequence_labels) 156 | seq_relationship_score = model(input_ids, token_type_ids, input_mask) 157 | outputs = { 158 | "loss": loss, 159 | "seq_relationship_score": seq_relationship_score, 160 | } 161 | return outputs 162 | 163 | def check_bert_for_next_sequence_prediction_output(self, result): 164 | self.parent.assertListEqual( 165 | list(result["seq_relationship_score"].size()), 166 | [self.batch_size, 2]) 167 | 168 | 169 | def create_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): 170 | model = BertForPreTraining(config=config) 171 | model.eval() 172 | loss = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels) 173 | prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask) 174 | outputs = { 175 | "loss": loss, 176 | "prediction_scores": prediction_scores, 177 | "seq_relationship_score": seq_relationship_score, 178 | } 179 | return outputs 180 | 181 | def check_bert_for_pretraining_output(self, result): 182 | self.parent.assertListEqual( 183 | list(result["prediction_scores"].size()), 184 | [self.batch_size, self.seq_length, self.vocab_size]) 185 | self.parent.assertListEqual( 186 | list(result["seq_relationship_score"].size()), 187 | [self.batch_size, 2]) 188 | 189 | 190 | def create_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): 191 | model = BertForQuestionAnswering(config=config) 192 | model.eval() 193 | loss = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels) 194 | start_logits, end_logits = model(input_ids, token_type_ids, input_mask) 195 | 
outputs = { 196 | "loss": loss, 197 | "start_logits": start_logits, 198 | "end_logits": end_logits, 199 | } 200 | return outputs 201 | 202 | def check_bert_for_question_answering_output(self, result): 203 | self.parent.assertListEqual( 204 | list(result["start_logits"].size()), 205 | [self.batch_size, self.seq_length]) 206 | self.parent.assertListEqual( 207 | list(result["end_logits"].size()), 208 | [self.batch_size, self.seq_length]) 209 | 210 | 211 | def create_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): 212 | model = BertForSequenceClassification(config=config, num_labels=self.num_labels) 213 | model.eval() 214 | loss = model(input_ids, token_type_ids, input_mask, sequence_labels) 215 | logits = model(input_ids, token_type_ids, input_mask) 216 | outputs = { 217 | "loss": loss, 218 | "logits": logits, 219 | } 220 | return outputs 221 | 222 | def check_bert_for_sequence_classification_output(self, result): 223 | self.parent.assertListEqual( 224 | list(result["logits"].size()), 225 | [self.batch_size, self.num_labels]) 226 | 227 | 228 | def create_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): 229 | model = BertForTokenClassification(config=config, num_labels=self.num_labels) 230 | model.eval() 231 | loss = model(input_ids, token_type_ids, input_mask, token_labels) 232 | logits = model(input_ids, token_type_ids, input_mask) 233 | outputs = { 234 | "loss": loss, 235 | "logits": logits, 236 | } 237 | return outputs 238 | 239 | def check_bert_for_token_classification_output(self, result): 240 | self.parent.assertListEqual( 241 | list(result["logits"].size()), 242 | [self.batch_size, self.seq_length, self.num_labels]) 243 | 244 | 245 | def test_default(self): 246 | self.run_tester(BertModelTest.BertModelTester(self)) 247 | 248 | def test_config_to_json_string(self): 249 | config = BertConfig(vocab_size_or_config_json_file=99, hidden_size=37) 250 | obj = json.loads(config.to_json_string()) 251 | self.assertEqual(obj["vocab_size"], 99) 252 | self.assertEqual(obj["hidden_size"], 37) 253 | 254 | def run_tester(self, tester): 255 | config_and_inputs = tester.prepare_config_and_inputs() 256 | output_result = tester.create_bert_model(*config_and_inputs) 257 | tester.check_bert_model_output(output_result) 258 | 259 | output_result = tester.create_bert_for_masked_lm(*config_and_inputs) 260 | tester.check_bert_for_masked_lm_output(output_result) 261 | tester.check_loss_output(output_result) 262 | 263 | output_result = tester.create_bert_for_next_sequence_prediction(*config_and_inputs) 264 | tester.check_bert_for_next_sequence_prediction_output(output_result) 265 | tester.check_loss_output(output_result) 266 | 267 | output_result = tester.create_bert_for_pretraining(*config_and_inputs) 268 | tester.check_bert_for_pretraining_output(output_result) 269 | tester.check_loss_output(output_result) 270 | 271 | output_result = tester.create_bert_for_question_answering(*config_and_inputs) 272 | tester.check_bert_for_question_answering_output(output_result) 273 | tester.check_loss_output(output_result) 274 | 275 | output_result = tester.create_bert_for_sequence_classification(*config_and_inputs) 276 | tester.check_bert_for_sequence_classification_output(output_result) 277 | tester.check_loss_output(output_result) 278 | 279 | output_result = tester.create_bert_for_token_classification(*config_and_inputs) 280 | tester.check_bert_for_token_classification_output(output_result) 
281 | tester.check_loss_output(output_result) 282 | 283 | @classmethod 284 | def ids_tensor(cls, shape, vocab_size, rng=None, name=None): 285 | """Creates a random int32 tensor of the shape within the vocab size.""" 286 | if rng is None: 287 | rng = random.Random() 288 | 289 | total_dims = 1 290 | for dim in shape: 291 | total_dims *= dim 292 | 293 | values = [] 294 | for _ in range(total_dims): 295 | values.append(rng.randint(0, vocab_size - 1)) 296 | 297 | return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous() 298 | 299 | 300 | if __name__ == "__main__": 301 | unittest.main() 302 | -------------------------------------------------------------------------------- /examples/lm_finetuning/finetune_on_pregenerated.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from pathlib import Path 3 | import torch 4 | import logging 5 | import json 6 | import random 7 | import numpy as np 8 | from collections import namedtuple 9 | from tempfile import TemporaryDirectory 10 | 11 | from torch.utils.data import DataLoader, Dataset, RandomSampler 12 | from torch.utils.data.distributed import DistributedSampler 13 | from tqdm import tqdm 14 | 15 | from pytorch_pretrained_bert.modeling import BertForPreTraining 16 | from pytorch_pretrained_bert.tokenization import BertTokenizer 17 | from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear 18 | 19 | InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids is_next") 20 | 21 | log_format = '%(asctime)-10s: %(message)s' 22 | logging.basicConfig(level=logging.INFO, format=log_format) 23 | 24 | 25 | def convert_example_to_features(example, tokenizer, max_seq_length): 26 | tokens = example["tokens"] 27 | segment_ids = example["segment_ids"] 28 | is_random_next = example["is_random_next"] 29 | masked_lm_positions = example["masked_lm_positions"] 30 | masked_lm_labels = example["masked_lm_labels"] 31 | 32 | assert len(tokens) == len(segment_ids) <= max_seq_length # The preprocessed data should be already truncated 33 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 34 | masked_label_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels) 35 | 36 | input_array = np.zeros(max_seq_length, dtype=np.int) 37 | input_array[:len(input_ids)] = input_ids 38 | 39 | mask_array = np.zeros(max_seq_length, dtype=np.bool) 40 | mask_array[:len(input_ids)] = 1 41 | 42 | segment_array = np.zeros(max_seq_length, dtype=np.bool) 43 | segment_array[:len(segment_ids)] = segment_ids 44 | 45 | lm_label_array = np.full(max_seq_length, dtype=np.int, fill_value=-1) 46 | lm_label_array[masked_lm_positions] = masked_label_ids 47 | 48 | features = InputFeatures(input_ids=input_array, 49 | input_mask=mask_array, 50 | segment_ids=segment_array, 51 | lm_label_ids=lm_label_array, 52 | is_next=is_random_next) 53 | return features 54 | 55 | 56 | class PregeneratedDataset(Dataset): 57 | def __init__(self, training_path, epoch, tokenizer, num_data_epochs, reduce_memory=False): 58 | self.vocab = tokenizer.vocab 59 | self.tokenizer = tokenizer 60 | self.epoch = epoch 61 | self.data_epoch = epoch % num_data_epochs 62 | data_file = training_path / f"epoch_{self.data_epoch}.json" 63 | metrics_file = training_path / f"epoch_{self.data_epoch}_metrics.json" 64 | assert data_file.is_file() and metrics_file.is_file() 65 | metrics = json.loads(metrics_file.read_text()) 66 | num_samples = metrics['num_training_examples'] 67 | seq_len = metrics['max_seq_len'] 68 | 
self.temp_dir = None 69 | self.working_dir = None 70 | if reduce_memory: 71 | self.temp_dir = TemporaryDirectory() 72 | self.working_dir = Path(self.temp_dir.name) 73 | input_ids = np.memmap(filename=self.working_dir/'input_ids.memmap', 74 | mode='w+', dtype=np.int32, shape=(num_samples, seq_len)) 75 | input_masks = np.memmap(filename=self.working_dir/'input_masks.memmap', 76 | shape=(num_samples, seq_len), mode='w+', dtype=np.bool) 77 | segment_ids = np.memmap(filename=self.working_dir/'segment_ids.memmap', 78 | shape=(num_samples, seq_len), mode='w+', dtype=np.bool) 79 | lm_label_ids = np.memmap(filename=self.working_dir/'lm_label_ids.memmap', 80 | shape=(num_samples, seq_len), mode='w+', dtype=np.int32) 81 | lm_label_ids[:] = -1 82 | is_nexts = np.memmap(filename=self.working_dir/'is_nexts.memmap', 83 | shape=(num_samples,), mode='w+', dtype=np.bool) 84 | else: 85 | input_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.int32) 86 | input_masks = np.zeros(shape=(num_samples, seq_len), dtype=np.bool) 87 | segment_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.bool) 88 | lm_label_ids = np.full(shape=(num_samples, seq_len), dtype=np.int32, fill_value=-1) 89 | is_nexts = np.zeros(shape=(num_samples,), dtype=np.bool) 90 | logging.info(f"Loading training examples for epoch {epoch}") 91 | with data_file.open() as f: 92 | for i, line in enumerate(tqdm(f, total=num_samples, desc="Training examples")): 93 | line = line.strip() 94 | example = json.loads(line) 95 | features = convert_example_to_features(example, tokenizer, seq_len) 96 | input_ids[i] = features.input_ids 97 | segment_ids[i] = features.segment_ids 98 | input_masks[i] = features.input_mask 99 | lm_label_ids[i] = features.lm_label_ids 100 | is_nexts[i] = features.is_next 101 | assert i == num_samples - 1 # Assert that the sample count metric was true 102 | logging.info("Loading complete!") 103 | self.num_samples = num_samples 104 | self.seq_len = seq_len 105 | self.input_ids = input_ids 106 | self.input_masks = input_masks 107 | self.segment_ids = segment_ids 108 | self.lm_label_ids = lm_label_ids 109 | self.is_nexts = is_nexts 110 | 111 | def __len__(self): 112 | return self.num_samples 113 | 114 | def __getitem__(self, item): 115 | return (torch.tensor(self.input_ids[item].astype(np.int64)), 116 | torch.tensor(self.input_masks[item].astype(np.int64)), 117 | torch.tensor(self.segment_ids[item].astype(np.int64)), 118 | torch.tensor(self.lm_label_ids[item].astype(np.int64)), 119 | torch.tensor(self.is_nexts[item].astype(np.int64))) 120 | 121 | 122 | def main(): 123 | parser = ArgumentParser() 124 | parser.add_argument('--pregenerated_data', type=Path, required=True) 125 | parser.add_argument('--output_dir', type=Path, required=True) 126 | parser.add_argument("--bert_model", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " 127 | "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") 128 | parser.add_argument("--do_lower_case", action="store_true") 129 | parser.add_argument("--reduce_memory", action="store_true", 130 | help="Store training data as on-disc memmaps to massively reduce memory usage") 131 | 132 | parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for") 133 | parser.add_argument("--local_rank", 134 | type=int, 135 | default=-1, 136 | help="local_rank for distributed training on gpus") 137 | parser.add_argument("--no_cuda", 138 | action='store_true', 139 | help="Whether not to use CUDA when available") 140 |
parser.add_argument('--gradient_accumulation_steps', 141 | type=int, 142 | default=1, 143 | help="Number of updates steps to accumulate before performing a backward/update pass.") 144 | parser.add_argument("--train_batch_size", 145 | default=32, 146 | type=int, 147 | help="Total batch size for training.") 148 | parser.add_argument('--fp16', 149 | action='store_true', 150 | help="Whether to use 16-bit float precision instead of 32-bit") 151 | parser.add_argument('--loss_scale', 152 | type=float, default=0, 153 | help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" 154 | "0 (default value): dynamic loss scaling.\n" 155 | "Positive power of 2: static loss scaling value.\n") 156 | parser.add_argument("--warmup_proportion", 157 | default=0.1, 158 | type=float, 159 | help="Proportion of training to perform linear learning rate warmup for. " 160 | "E.g., 0.1 = 10%% of training.") 161 | parser.add_argument("--learning_rate", 162 | default=3e-5, 163 | type=float, 164 | help="The initial learning rate for Adam.") 165 | parser.add_argument('--seed', 166 | type=int, 167 | default=42, 168 | help="random seed for initialization") 169 | args = parser.parse_args() 170 | 171 | assert args.pregenerated_data.is_dir(), \ 172 | "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!" 173 | 174 | samples_per_epoch = [] 175 | for i in range(args.epochs): 176 | epoch_file = args.pregenerated_data / f"epoch_{i}.json" 177 | metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json" 178 | if epoch_file.is_file() and metrics_file.is_file(): 179 | metrics = json.loads(metrics_file.read_text()) 180 | samples_per_epoch.append(metrics['num_training_examples']) 181 | else: 182 | if i == 0: 183 | exit("No training data was found!") 184 | print(f"Warning! 
There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).") 185 | print("This script will loop over the available data, but training diversity may be negatively impacted.") 186 | num_data_epochs = i 187 | break 188 | else: 189 | num_data_epochs = args.epochs 190 | 191 | if args.local_rank == -1 or args.no_cuda: 192 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 193 | n_gpu = torch.cuda.device_count() 194 | else: 195 | torch.cuda.set_device(args.local_rank) 196 | device = torch.device("cuda", args.local_rank) 197 | n_gpu = 1 198 | # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 199 | torch.distributed.init_process_group(backend='nccl') 200 | logging.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( 201 | device, n_gpu, bool(args.local_rank != -1), args.fp16)) 202 | 203 | if args.gradient_accumulation_steps < 1: 204 | raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( 205 | args.gradient_accumulation_steps)) 206 | 207 | args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps 208 | 209 | random.seed(args.seed) 210 | np.random.seed(args.seed) 211 | torch.manual_seed(args.seed) 212 | if n_gpu > 0: 213 | torch.cuda.manual_seed_all(args.seed) 214 | 215 | if args.output_dir.is_dir() and list(args.output_dir.iterdir()): 216 | logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!") 217 | args.output_dir.mkdir(parents=True, exist_ok=True) 218 | 219 | tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) 220 | 221 | total_train_examples = 0 222 | for i in range(args.epochs): 223 | # The modulo takes into account the fact that we may loop over limited epochs of data 224 | total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] 225 | 226 | num_train_optimization_steps = int( 227 | total_train_examples / args.train_batch_size / args.gradient_accumulation_steps) 228 | if args.local_rank != -1: 229 | num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() 230 | 231 | # Prepare model 232 | model = BertForPreTraining.from_pretrained(args.bert_model) 233 | if args.fp16: 234 | model.half() 235 | model.to(device) 236 | if args.local_rank != -1: 237 | try: 238 | from apex.parallel import DistributedDataParallel as DDP 239 | except ImportError: 240 | raise ImportError( 241 | "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") 242 | model = DDP(model) 243 | elif n_gpu > 1: 244 | model = torch.nn.DataParallel(model) 245 | 246 | # Prepare optimizer 247 | param_optimizer = list(model.named_parameters()) 248 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 249 | optimizer_grouped_parameters = [ 250 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 251 | 'weight_decay': 0.01}, 252 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 253 | ] 254 | 255 | if args.fp16: 256 | try: 257 | from apex.optimizers import FP16_Optimizer 258 | from apex.optimizers import FusedAdam 259 | except ImportError: 260 | raise ImportError( 261 | "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") 262 | 263 | optimizer = FusedAdam(optimizer_grouped_parameters, 264 | lr=args.learning_rate, 265 | 
bias_correction=False, 266 | max_grad_norm=1.0) 267 | if args.loss_scale == 0: 268 | optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) 269 | else: 270 | optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) 271 | 272 | else: 273 | optimizer = BertAdam(optimizer_grouped_parameters, 274 | lr=args.learning_rate, 275 | warmup=args.warmup_proportion, 276 | t_total=num_train_optimization_steps) 277 | 278 | global_step = 0 279 | logging.info("***** Running training *****") 280 | logging.info(f" Num examples = {total_train_examples}") 281 | logging.info(" Batch size = %d", args.train_batch_size) 282 | logging.info(" Num steps = %d", num_train_optimization_steps) 283 | model.train() 284 | for epoch in range(args.epochs): 285 | epoch_dataset = PregeneratedDataset(epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer, 286 | num_data_epochs=num_data_epochs) 287 | if args.local_rank == -1: 288 | train_sampler = RandomSampler(epoch_dataset) 289 | else: 290 | train_sampler = DistributedSampler(epoch_dataset) 291 | train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size) 292 | tr_loss = 0 293 | nb_tr_examples, nb_tr_steps = 0, 0 294 | with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar: 295 | for step, batch in enumerate(train_dataloader): 296 | batch = tuple(t.to(device) for t in batch) 297 | input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch 298 | loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) 299 | if n_gpu > 1: 300 | loss = loss.mean() # mean() to average on multi-gpu. 301 | if args.gradient_accumulation_steps > 1: 302 | loss = loss / args.gradient_accumulation_steps 303 | if args.fp16: 304 | optimizer.backward(loss) 305 | else: 306 | loss.backward() 307 | tr_loss += loss.item() 308 | nb_tr_examples += input_ids.size(0) 309 | nb_tr_steps += 1 310 | pbar.update(1) 311 | mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps 312 | pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") 313 | if (step + 1) % args.gradient_accumulation_steps == 0: 314 | if args.fp16: 315 | # modify learning rate with special warm up BERT uses 316 | # if args.fp16 is False, BertAdam is used that handles this automatically 317 | lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, 318 | args.warmup_proportion) 319 | for param_group in optimizer.param_groups: 320 | param_group['lr'] = lr_this_step 321 | optimizer.step() 322 | optimizer.zero_grad() 323 | global_step += 1 324 | 325 | # Save a trained model 326 | logging.info("** ** * Saving fine-tuned model ** ** * ") 327 | model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self 328 | output_model_file = args.output_dir / "pytorch_model.bin" 329 | torch.save(model_to_save.state_dict(), str(output_model_file)) 330 | 331 | 332 | if __name__ == '__main__': 333 | main() 334 | --------------------------------------------------------------------------------
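Both examples/run_openai_gpt.py and examples/lm_finetuning/finetune_on_pregenerated.py build their optimizers from two parameter groups, excluding biases and LayerNorm parameters from weight decay by matching parameter names. The following is a minimal, self-contained sketch of that grouping pattern only; TinyBlock is a hypothetical stand-in module, and plain torch.optim.Adam is used in place of BertAdam/OpenAIAdam purely so the snippet runs without this package installed.

import torch


class TinyBlock(torch.nn.Module):
    """Hypothetical stand-in module; attribute name LayerNorm mirrors BERT's naming."""
    def __init__(self):
        super(TinyBlock, self).__init__()
        self.dense = torch.nn.Linear(16, 16)
        self.LayerNorm = torch.nn.LayerNorm(16)

    def forward(self, x):
        return self.LayerNorm(self.dense(x))


model = TinyBlock()
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    # Regular weights: apply L2 weight decay.
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    # Biases and LayerNorm parameters: no weight decay.
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=3e-5)

# One illustrative update step on dummy data.
loss = model(torch.randn(4, 16)).pow(2).mean()
loss.backward()
optimizer.step()
optimizer.zero_grad()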