├── sentence_similarity_Bert
│   ├── pytorch_pretrained_bert
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── modeling.cpython-36.pyc
│   │   │   ├── file_utils.cpython-36.pyc
│   │   │   ├── modeling_gpt2.cpython-36.pyc
│   │   │   ├── optimization.cpython-36.pyc
│   │   │   ├── tokenization.cpython-36.pyc
│   │   │   ├── modeling_openai.cpython-36.pyc
│   │   │   ├── tokenization_gpt2.cpython-36.pyc
│   │   │   ├── modeling_transfo_xl.cpython-36.pyc
│   │   │   ├── optimization_openai.cpython-36.pyc
│   │   │   ├── tokenization_openai.cpython-36.pyc
│   │   │   ├── tokenization_transfo_xl.cpython-36.pyc
│   │   │   └── modeling_transfo_xl_utilities.cpython-36.pyc
│   │   ├── __init__.py
│   │   ├── convert_tf_checkpoint_to_pytorch.py
│   │   ├── convert_gpt2_checkpoint_to_pytorch.py
│   │   ├── convert_openai_checkpoint_to_pytorch.py
│   │   ├── __main__.py
│   │   ├── convert_transfo_xl_checkpoint_to_pytorch.py
│   │   ├── optimization_openai.py
│   │   ├── optimization.py
│   │   ├── file_utils.py
│   │   ├── tokenization_gpt2.py
│   │   ├── tokenization_openai.py
│   │   ├── tokenization.py
│   │   └── modeling_transfo_xl_utilities.py
│   ├── examples
│   │   ├── models
│   │   │   └── chinese_L-12_H-768_A-12
│   │   │       ├── bert_model.ckpt.index
│   │   │       ├── bert_model.ckpt.meta
│   │   │       └── bert_config.json
│   │   ├── run_classifier_class.py
│   │   ├── extract_features.py
│   │   └── run_classifier_modify2.py
│   ├── requirements.txt
│   ├── tests
│   │   ├── optimization_test.py
│   │   ├── tokenization_openai_test.py
│   │   ├── tokenization_transfo_xl_test.py
│   │   ├── tokenization_test.py
│   │   ├── modeling_gpt2_test.py
│   │   ├── modeling_transfo_xl_test.py
│   │   ├── modeling_openai_test.py
│   │   └── modeling_test.py
│   └── setup.py
└── README.md
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Bert_sentence_similarity
2 | 
3 | Fine-tune a pretrained BERT model to compute sentence similarity
4 | 
5 | 1) Run ./sentence_similarity_Bert/examples/run_classifier_modify2 to fine-tune
6 | 
7 | 2) The training set is the Ant Financial text-matching data, located in the chinese_data folder
8 | 
9 | 3) Run run_classifier_class to test
10 | 
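11 | As a quick sanity check after fine-tuning, the sketch below scores one sentence pair directly with the vendored `pytorch_pretrained_bert` API. It is a minimal, hypothetical example: `examples/fine_tuned/` stands in for wherever the fine-tuning run saved `pytorch_model.bin`, `bert_config.json` and `vocab.txt`, and the two questions are made up.
12 | 
13 | ```python
14 | import torch
15 | import torch.nn.functional as F
16 | from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification
17 | 
18 | model_dir = "examples/fine_tuned/"  # placeholder path, see step 1) above
19 | tokenizer = BertTokenizer.from_pretrained(model_dir)
20 | model = BertForSequenceClassification.from_pretrained(model_dir, num_labels=2)
21 | model.eval()
22 | 
23 | # Standard BERT sentence-pair encoding: [CLS] sentence_a [SEP] sentence_b [SEP]
24 | tokens_a = tokenizer.tokenize("花呗怎么还款")
25 | tokens_b = tokenizer.tokenize("花呗如何还款")
26 | tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
27 | input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
28 | segment_ids = torch.tensor([[0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)])
29 | 
30 | with torch.no_grad():
31 |     logits = model(input_ids, segment_ids)  # returns logits when no labels are passed
32 | print(F.softmax(logits, dim=-1))  # probabilities over the two labels; which index means "match" depends on the training labels
33 | ```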
--------------------------------------------------------------------------------
/sentence_similarity_Bert/requirements.txt:
--------------------------------------------------------------------------------
1 | # PyTorch
2 | torch>=0.4.1
3 | # progress bars in model download and training scripts
4 | tqdm
5 | # Accessing files from S3 directly.
6 | boto3
7 | # Used for downloading models over HTTP
8 | requests
9 | # For OpenAI GPT
10 | regex
--------------------------------------------------------------------------------
/sentence_similarity_Bert/examples/models/chinese_L-12_H-768_A-12/bert_config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "attention_probs_dropout_prob": 0.1,
3 |   "directionality": "bidi",
4 |   "hidden_act": "gelu",
5 |   "hidden_dropout_prob": 0.1,
6 |   "hidden_size": 768,
7 |   "initializer_range": 0.02,
8 |   "intermediate_size": 3072,
9 |   "max_position_embeddings": 512,
10 |   "num_attention_heads": 12,
11 |   "num_hidden_layers": 12,
12 |   "pooler_fc_size": 768,
13 |   "pooler_num_attention_heads": 12,
14 |   "pooler_num_fc_layers": 3,
15 |   "pooler_size_per_head": 128,
16 |   "pooler_type": "first_token_transform",
17 |   "type_vocab_size": 2,
18 |   "vocab_size": 21128
19 | }
20 | 
--------------------------------------------------------------------------------
/sentence_similarity_Bert/pytorch_pretrained_bert/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.6.1"
2 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
3 | from .tokenization_openai import OpenAIGPTTokenizer
4 | from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
5 | from .tokenization_gpt2 import GPT2Tokenizer
6 | 
7 | from .modeling import (BertConfig, BertModel, BertForPreTraining,
8 |                        BertForMaskedLM, BertForNextSentencePrediction,
9 |                        BertForSequenceClassification, BertForMultipleChoice,
10 |                        BertForTokenClassification, BertForQuestionAnswering,
11 |                        load_tf_weights_in_bert)
12 | from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,
13 |                               OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
14 |                               load_tf_weights_in_openai_gpt)
15 | from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel,
16 | 
load_tf_weights_in_transfo_xl) 17 | from .modeling_gpt2 import (GPT2Config, GPT2Model, 18 | GPT2LMHeadModel, GPT2DoubleHeadsModel, 19 | load_tf_weights_in_gpt2) 20 | 21 | from .optimization import BertAdam 22 | from .optimization_openai import OpenAIAdam 23 | 24 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path 25 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/tests/optimization_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | 21 | import torch 22 | 23 | from pytorch_pretrained_bert import BertAdam 24 | 25 | class OptimizationTest(unittest.TestCase): 26 | 27 | def assertListAlmostEqual(self, list1, list2, tol): 28 | self.assertEqual(len(list1), len(list2)) 29 | for a, b in zip(list1, list2): 30 | self.assertAlmostEqual(a, b, delta=tol) 31 | 32 | def test_adam(self): 33 | w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True) 34 | target = torch.tensor([0.4, 0.2, -0.5]) 35 | criterion = torch.nn.MSELoss() 36 | # No warmup, constant schedule, no gradient clipping 37 | optimizer = BertAdam(params=[w], lr=2e-1, 38 | weight_decay=0.0, 39 | max_grad_norm=-1) 40 | for _ in range(100): 41 | loss = criterion(w, target) 42 | loss.backward() 43 | optimizer.step() 44 | w.grad.detach_() # No zero_grad() function on simple tensors. we do it ourselves. 45 | w.grad.zero_() 46 | self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2) 47 | 48 | 49 | if __name__ == "__main__": 50 | unittest.main() 51 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/tests/tokenization_openai_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from __future__ import absolute_import, division, print_function, unicode_literals
16 | 
17 | import os
18 | import unittest
19 | import json
20 | 
21 | from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer
22 | 
23 | 
24 | class OpenAIGPTTokenizationTest(unittest.TestCase):
25 | 
26 |     def test_full_tokenizer(self):
27 |         """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
28 |         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
29 |                  "w</w>", "r</w>", "t</w>",
30 |                  "lo", "low", "er</w>",
31 |                  "low</w>", "lowest</w>", "newer</w>", "wider</w>"]
32 |         vocab_tokens = dict(zip(vocab, range(len(vocab))))
33 |         merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
34 |         with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
35 |             json.dump(vocab_tokens, fp)
36 |             vocab_file = fp.name
37 |         with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
38 |             fp.write("\n".join(merges))
39 |             merges_file = fp.name
40 | 
41 |         tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file, special_tokens=["<unk>"])
42 |         os.remove(vocab_file)
43 |         os.remove(merges_file)
44 | 
45 |         text = "lower"
46 |         bpe_tokens = ["low", "er</w>"]
47 |         tokens = tokenizer.tokenize(text)
48 |         self.assertListEqual(tokens, bpe_tokens)
49 | 
50 |         input_tokens = tokens + ["<unk>"]
51 |         input_bpe_tokens = [14, 15, 20]
52 |         self.assertListEqual(
53 |             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
54 | 
55 | if __name__ == '__main__':
56 |     unittest.main()
57 | 
--------------------------------------------------------------------------------
/sentence_similarity_Bert/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert BERT checkpoint."""
16 | 
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | import os
22 | import re
23 | import argparse
24 | import tensorflow as tf
25 | import torch
26 | import numpy as np
27 | 
28 | from pytorch_pretrained_bert.modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert
29 | 
30 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
31 |     # Initialise PyTorch model
32 |     config = BertConfig.from_json_file(bert_config_file)
33 |     print("Building PyTorch model from configuration: {}".format(str(config)))
34 |     model = BertForPreTraining(config)
35 | 
36 |     # Load weights from tf checkpoint
37 |     load_tf_weights_in_bert(model, tf_checkpoint_path)
38 | 
39 |     # Save pytorch-model
40 |     print("Save PyTorch model to {}".format(pytorch_dump_path))
41 |     torch.save(model.state_dict(), pytorch_dump_path)
42 | 
43 | 
44 | if __name__ == "__main__":
45 |     parser = argparse.ArgumentParser()
46 |     ## Required parameters
47 |     parser.add_argument("--tf_checkpoint_path",
48 |                         default = None,
49 |                         type = str,
50 |                         required = True,
51 |                         help = "Path to the TensorFlow checkpoint.")
52 |     parser.add_argument("--bert_config_file",
53 |                         default = None,
54 |                         type = str,
55 |                         required = True,
56 |                         help = "The config json file corresponding to the pre-trained BERT model. \n"
57 |                                "This specifies the model architecture.")
58 |     parser.add_argument("--pytorch_dump_path",
59 |                         default = None,
60 |                         type = str,
61 |                         required = True,
62 |                         help = "Path to the output PyTorch model.")
63 |     args = parser.parse_args()
64 |     convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
65 |                                      args.bert_config_file,
66 |                                      args.pytorch_dump_path)
67 | 
--------------------------------------------------------------------------------
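Editor's note: a minimal sketch of driving the converter above against the Chinese BERT checkpoint bundled under examples/models/ (paths are relative to sentence_similarity_Bert/ and the output filename is an arbitrary choice; TensorFlow must be installed for load_tf_weights_in_bert to work):

    from pytorch_pretrained_bert.convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch

    convert_tf_checkpoint_to_pytorch(
        "examples/models/chinese_L-12_H-768_A-12/bert_model.ckpt",   # TF checkpoint prefix (.index/.meta sit next to it)
        "examples/models/chinese_L-12_H-768_A-12/bert_config.json",  # architecture description
        "examples/models/chinese_L-12_H-768_A-12/pytorch_model.bin") # output PyTorch weights

The same conversion is exposed on the command line by __main__.py further down as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`.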
15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import re 23 | import argparse 24 | import tensorflow as tf 25 | import torch 26 | import numpy as np 27 | 28 | from pytorch_pretrained_bert.modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert 29 | 30 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 31 | # Initialise PyTorch model 32 | config = BertConfig.from_json_file(bert_config_file) 33 | print("Building PyTorch model from configuration: {}".format(str(config))) 34 | model = BertForPreTraining(config) 35 | 36 | # Load weights from tf checkpoint 37 | load_tf_weights_in_bert(model, tf_checkpoint_path) 38 | 39 | # Save pytorch-model 40 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 41 | torch.save(model.state_dict(), pytorch_dump_path) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | ## Required parameters 47 | parser.add_argument("--tf_checkpoint_path", 48 | default = None, 49 | type = str, 50 | required = True, 51 | help = "Path the TensorFlow checkpoint path.") 52 | parser.add_argument("--bert_config_file", 53 | default = None, 54 | type = str, 55 | required = True, 56 | help = "The config json file corresponding to the pre-trained BERT model. \n" 57 | "This specifies the model architecture.") 58 | parser.add_argument("--pytorch_dump_path", 59 | default = None, 60 | type = str, 61 | required = True, 62 | help = "Path to the output PyTorch model.") 63 | args = parser.parse_args() 64 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, 65 | args.bert_config_file, 66 | args.pytorch_dump_path) 67 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple check list from AllenNLP repo: https://github.com/allenai/allennlp/blob/master/setup.py 3 | 4 | To create the package for pypi. 5 | 6 | 1. Change the version in __init__.py and setup.py. 7 | 8 | 2. Commit these changes with the message: "Release: VERSION" 9 | 10 | 3. Add a tag in git to mark the release: "git tag VERSION -m'Adds tag VERSION for pypi' " 11 | Push the tag to git: git push --tags origin master 12 | 13 | 4. Build both the sources and the wheel. Do not change anything in setup.py between 14 | creating the wheel and the source distribution (obviously). 15 | 16 | For the wheel, run: "python setup.py bdist_wheel" in the top level allennlp directory. 17 | (this will build a wheel for the python version you use to build it - make sure you use python 3.x). 18 | 19 | For the sources, run: "python setup.py sdist" 20 | You should now have a /dist directory with both .whl and .tar.gz source versions of allennlp. 21 | 22 | 5. Check that everything looks correct by uploading the package to the pypi test server: 23 | 24 | twine upload dist/* -r pypitest 25 | (pypi suggest using twine as other methods upload files via plaintext.) 26 | 27 | Check that you can install it in a virtualenv by running: 28 | pip install -i https://testpypi.python.org/pypi allennlp 29 | 30 | 6. Upload the final version to actual pypi: 31 | twine upload dist/* -r pypi 32 | 33 | 7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory. 
34 | 35 | """ 36 | from io import open 37 | from setuptools import find_packages, setup 38 | 39 | setup( 40 | name="pytorch_pretrained_bert", 41 | version="0.6.1", 42 | author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors", 43 | author_email="thomas@huggingface.co", 44 | description="PyTorch version of Google AI BERT model with script to load Google pre-trained models", 45 | long_description=open("README.md", "r", encoding='utf-8').read(), 46 | long_description_content_type="text/markdown", 47 | keywords='BERT NLP deep learning google', 48 | license='Apache', 49 | url="https://github.com/huggingface/pytorch-pretrained-BERT", 50 | packages=find_packages(exclude=["*.tests", "*.tests.*", 51 | "tests.*", "tests"]), 52 | install_requires=['torch>=0.4.1', 53 | 'numpy', 54 | 'boto3', 55 | 'requests', 56 | 'tqdm', 57 | 'regex'], 58 | entry_points={ 59 | 'console_scripts': [ 60 | "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main", 61 | ] 62 | }, 63 | # python_requires='>=3.5.0', 64 | tests_require=['pytest'], 65 | classifiers=[ 66 | 'Intended Audience :: Science/Research', 67 | 'License :: OSI Approved :: Apache Software License', 68 | 'Programming Language :: Python :: 3', 69 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 70 | ], 71 | ) 72 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from pytorch_pretrained_bert.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME, 25 | GPT2Config, 26 | GPT2Model, 27 | load_tf_weights_in_gpt2) 28 | 29 | 30 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 31 | # Construct model 32 | if gpt2_config_file == "": 33 | config = GPT2Config() 34 | else: 35 | config = GPT2Config(gpt2_config_file) 36 | model = GPT2Model(config) 37 | 38 | # Load weights from numpy 39 | load_tf_weights_in_gpt2(model, gpt2_checkpoint_path) 40 | 41 | # Save pytorch-model 42 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 43 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 44 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 45 | torch.save(model.state_dict(), pytorch_weights_dump_path) 46 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 47 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 48 | f.write(config.to_json_string()) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | ## Required parameters 54 | parser.add_argument("--gpt2_checkpoint_path", 55 | default = None, 56 | type = str, 57 | required = True, 58 | help = "Path the TensorFlow checkpoint path.") 59 | parser.add_argument("--pytorch_dump_folder_path", 60 | default = None, 61 | type = str, 62 | required = True, 63 | help = "Path to the output PyTorch model.") 64 | parser.add_argument("--gpt2_config_file", 65 | default = "", 66 | type = str, 67 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 68 | "This specifies the model architecture.") 69 | args = parser.parse_args() 70 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, 71 | args.gpt2_config_file, 72 | args.pytorch_dump_folder_path) 73 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from pytorch_pretrained_bert.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME, 25 | OpenAIGPTConfig, 26 | OpenAIGPTModel, 27 | load_tf_weights_in_openai_gpt) 28 | 29 | 30 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 31 | # Construct model 32 | if openai_config_file == "": 33 | config = OpenAIGPTConfig() 34 | else: 35 | config = OpenAIGPTConfig(openai_config_file) 36 | model = OpenAIGPTModel(config) 37 | 38 | # Load weights from numpy 39 | load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path) 40 | 41 | # Save pytorch-model 42 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 43 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 44 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 45 | torch.save(model.state_dict(), pytorch_weights_dump_path) 46 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 47 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 48 | f.write(config.to_json_string()) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | ## Required parameters 54 | parser.add_argument("--openai_checkpoint_folder_path", 55 | default = None, 56 | type = str, 57 | required = True, 58 | help = "Path the TensorFlow checkpoint path.") 59 | parser.add_argument("--pytorch_dump_folder_path", 60 | default = None, 61 | type = str, 62 | required = True, 63 | help = "Path to the output PyTorch model.") 64 | parser.add_argument("--openai_config_file", 65 | default = "", 66 | type = str, 67 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 68 | "This specifies the model architecture.") 69 | args = parser.parse_args() 70 | convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path, 71 | args.openai_config_file, 72 | args.pytorch_dump_folder_path) 73 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/tests/tokenization_transfo_xl_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from __future__ import absolute_import, division, print_function, unicode_literals
16 | 
17 | import os
18 | import unittest
19 | from io import open
20 | 
21 | from pytorch_pretrained_bert.tokenization_transfo_xl import (TransfoXLTokenizer,
22 |                                                              _is_control, _is_punctuation,
23 |                                                              _is_whitespace)
24 | 
25 | 
26 | class TransfoXLTokenizationTest(unittest.TestCase):
27 | 
28 |     def test_full_tokenizer(self):
29 |         vocab_tokens = [
30 |             "<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un", "running", ","
31 |         ]
32 |         with open("/tmp/transfo_xl_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer:
33 |             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
34 |             vocab_file = vocab_writer.name
35 | 
36 |         tokenizer = TransfoXLTokenizer(vocab_file=vocab_file, lower_case=True)
37 |         tokenizer.build_vocab()
38 |         os.remove(vocab_file)
39 | 
40 |         tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
41 |         self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
42 | 
43 |         self.assertListEqual(
44 |             tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
45 | 
46 |     def test_full_tokenizer_lower(self):
47 |         tokenizer = TransfoXLTokenizer(lower_case=True)
48 | 
49 |         self.assertListEqual(
50 |             tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "),
51 |             ["hello", "!", "how", "are", "you", "?"])
52 |         self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["h\u00E9llo"])
53 | 
54 |     def test_full_tokenizer_no_lower(self):
55 |         tokenizer = TransfoXLTokenizer(lower_case=False)
56 | 
57 |         self.assertListEqual(
58 |             tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "),
59 |             ["HeLLo", "!", "how", "Are", "yoU", "?"])
60 | 
61 |     def test_is_whitespace(self):
62 |         self.assertTrue(_is_whitespace(u" "))
63 |         self.assertTrue(_is_whitespace(u"\t"))
64 |         self.assertTrue(_is_whitespace(u"\r"))
65 |         self.assertTrue(_is_whitespace(u"\n"))
66 |         self.assertTrue(_is_whitespace(u"\u00A0"))
67 | 
68 |         self.assertFalse(_is_whitespace(u"A"))
69 |         self.assertFalse(_is_whitespace(u"-"))
70 | 
71 |     def test_is_control(self):
72 |         self.assertTrue(_is_control(u"\u0005"))
73 | 
74 |         self.assertFalse(_is_control(u"A"))
75 |         self.assertFalse(_is_control(u" "))
76 |         self.assertFalse(_is_control(u"\t"))
77 |         self.assertFalse(_is_control(u"\r"))
78 | 
79 |     def test_is_punctuation(self):
80 |         self.assertTrue(_is_punctuation(u"-"))
81 |         self.assertTrue(_is_punctuation(u"$"))
82 |         self.assertTrue(_is_punctuation(u"`"))
83 |         self.assertTrue(_is_punctuation(u"."))
84 | 
85 |         self.assertFalse(_is_punctuation(u"A"))
86 |         self.assertFalse(_is_punctuation(u" "))
87 | 
88 | 
89 | if __name__ == '__main__':
90 |     unittest.main()
91 | 
--------------------------------------------------------------------------------
/sentence_similarity_Bert/pytorch_pretrained_bert/__main__.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | def main():
3 |     import sys
4 |     if (len(sys.argv) != 4 and len(sys.argv) != 5) or sys.argv[1] not in [
5 |         "convert_tf_checkpoint_to_pytorch",
6 |         "convert_openai_checkpoint",
7 |         "convert_transfo_xl_checkpoint",
8 |         "convert_gpt2_checkpoint",
9 |     ]:
10 |         print(
11 |             "Should be used as one of: \n"
12 |             ">> `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n"
13 |             ">> `pytorch_pretrained_bert convert_openai_checkpoint OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`, \n"
14 |             ">> `pytorch_pretrained_bert convert_transfo_xl_checkpoint TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG]` or \n"
15 |             ">> `pytorch_pretrained_bert convert_gpt2_checkpoint TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG]`")
16 |     else:
17 |         if sys.argv[1] == "convert_tf_checkpoint_to_pytorch":
18 |             try:
19 |                 from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
20 |             except ImportError:
21 |                 print("pytorch_pretrained_bert can only be used from the command line to convert TensorFlow models to PyTorch. "
22 |                       "In that case, it requires TensorFlow to be installed. Please see "
23 |                       "https://www.tensorflow.org/install/ for installation instructions.")
24 |                 raise
25 | 
26 |             if len(sys.argv) != 5:
27 |                 # pylint: disable=line-too-long
28 |                 print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
29 |             else:
30 |                 PYTORCH_DUMP_OUTPUT = sys.argv.pop()
31 |                 TF_CONFIG = sys.argv.pop()
32 |                 TF_CHECKPOINT = sys.argv.pop()
33 |                 convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
34 |         elif sys.argv[1] == "convert_openai_checkpoint":
35 |             from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
36 |             OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
37 |             PYTORCH_DUMP_OUTPUT = sys.argv[3]
38 |             if len(sys.argv) == 5:
39 |                 OPENAI_GPT_CONFIG = sys.argv[4]
40 |             else:
41 |                 OPENAI_GPT_CONFIG = ""
42 |             convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH,
43 |                                                  OPENAI_GPT_CONFIG,
44 |                                                  PYTORCH_DUMP_OUTPUT)
45 |         elif sys.argv[1] == "convert_transfo_xl_checkpoint":
46 |             try:
47 |                 from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
48 |             except ImportError:
49 |                 print("pytorch_pretrained_bert can only be used from the command line to convert TensorFlow models to PyTorch. "
50 |                       "In that case, it requires TensorFlow to be installed. Please see "
51 |                       "https://www.tensorflow.org/install/ for installation instructions.")
52 |                 raise
53 | 
54 |             if 'ckpt' in sys.argv[2].lower():
55 |                 TF_CHECKPOINT = sys.argv[2]
56 |                 TF_DATASET_FILE = ""
57 |             else:
58 |                 TF_DATASET_FILE = sys.argv[2]
59 |                 TF_CHECKPOINT = ""
60 |             PYTORCH_DUMP_OUTPUT = sys.argv[3]
61 |             if len(sys.argv) == 5:
62 |                 TF_CONFIG = sys.argv[4]
63 |             else:
64 |                 TF_CONFIG = ""
65 |             convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE)
66 |         else:
67 |             try:
68 |                 from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
69 |             except ImportError:
70 |                 print("pytorch_pretrained_bert can only be used from the command line to convert TensorFlow models to PyTorch. "
71 |                       "In that case, it requires TensorFlow to be installed. Please see "
72 |                       "https://www.tensorflow.org/install/ for installation instructions.")
73 |                 raise
74 | 
75 |             TF_CHECKPOINT = sys.argv[2]
76 |             PYTORCH_DUMP_OUTPUT = sys.argv[3]
77 |             if len(sys.argv) == 5:
78 |                 TF_CONFIG = sys.argv[4]
79 |             else:
80 |                 TF_CONFIG = ""
81 |             convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
82 | if __name__ == '__main__':
83 |     main()
84 | 
--------------------------------------------------------------------------------
/sentence_similarity_Bert/tests/tokenization_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | from io import open 20 | 21 | from pytorch_pretrained_bert.tokenization import (BasicTokenizer, 22 | BertTokenizer, 23 | WordpieceTokenizer, 24 | _is_control, _is_punctuation, 25 | _is_whitespace) 26 | 27 | 28 | class TokenizationTest(unittest.TestCase): 29 | 30 | def test_full_tokenizer(self): 31 | vocab_tokens = [ 32 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 33 | "##ing", "," 34 | ] 35 | with open("/tmp/bert_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer: 36 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) 37 | 38 | vocab_file = vocab_writer.name 39 | 40 | tokenizer = BertTokenizer(vocab_file) 41 | os.remove(vocab_file) 42 | 43 | tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") 44 | self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) 45 | 46 | self.assertListEqual( 47 | tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) 48 | 49 | def test_full_tokenizer_raises_error_for_long_sequences(self): 50 | vocab_tokens = [ 51 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 52 | "##ing", "," 53 | ] 54 | with open("/tmp/bert_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer: 55 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) 56 | vocab_file = vocab_writer.name 57 | 58 | tokenizer = BertTokenizer(vocab_file, max_len=10) 59 | os.remove(vocab_file) 60 | tokens = tokenizer.tokenize(u"the cat sat on the mat in the summer time") 61 | indices = tokenizer.convert_tokens_to_ids(tokens) 62 | self.assertListEqual(indices, [0 for _ in range(10)]) 63 | 64 | tokens = tokenizer.tokenize(u"the cat sat on the mat in the summer time .") 65 | self.assertRaises(ValueError, tokenizer.convert_tokens_to_ids, tokens) 66 | 67 | def test_chinese(self): 68 | tokenizer = BasicTokenizer() 69 | 70 | self.assertListEqual( 71 | tokenizer.tokenize(u"ah\u535A\u63A8zz"), 72 | [u"ah", u"\u535A", u"\u63A8", u"zz"]) 73 | 74 | def test_basic_tokenizer_lower(self): 75 | tokenizer = BasicTokenizer(do_lower_case=True) 76 | 77 | self.assertListEqual( 78 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), 79 | ["hello", "!", "how", "are", "you", "?"]) 80 | self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) 81 | 82 | def test_basic_tokenizer_no_lower(self): 83 | tokenizer = BasicTokenizer(do_lower_case=False) 84 | 85 | self.assertListEqual( 86 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? 
"), 87 | ["HeLLo", "!", "how", "Are", "yoU", "?"]) 88 | 89 | def test_wordpiece_tokenizer(self): 90 | vocab_tokens = [ 91 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 92 | "##ing" 93 | ] 94 | 95 | vocab = {} 96 | for (i, token) in enumerate(vocab_tokens): 97 | vocab[token] = i 98 | tokenizer = WordpieceTokenizer(vocab=vocab) 99 | 100 | self.assertListEqual(tokenizer.tokenize(""), []) 101 | 102 | self.assertListEqual( 103 | tokenizer.tokenize("unwanted running"), 104 | ["un", "##want", "##ed", "runn", "##ing"]) 105 | 106 | self.assertListEqual( 107 | tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) 108 | 109 | def test_is_whitespace(self): 110 | self.assertTrue(_is_whitespace(u" ")) 111 | self.assertTrue(_is_whitespace(u"\t")) 112 | self.assertTrue(_is_whitespace(u"\r")) 113 | self.assertTrue(_is_whitespace(u"\n")) 114 | self.assertTrue(_is_whitespace(u"\u00A0")) 115 | 116 | self.assertFalse(_is_whitespace(u"A")) 117 | self.assertFalse(_is_whitespace(u"-")) 118 | 119 | def test_is_control(self): 120 | self.assertTrue(_is_control(u"\u0005")) 121 | 122 | self.assertFalse(_is_control(u"A")) 123 | self.assertFalse(_is_control(u" ")) 124 | self.assertFalse(_is_control(u"\t")) 125 | self.assertFalse(_is_control(u"\r")) 126 | 127 | def test_is_punctuation(self): 128 | self.assertTrue(_is_punctuation(u"-")) 129 | self.assertTrue(_is_punctuation(u"$")) 130 | self.assertTrue(_is_punctuation(u"`")) 131 | self.assertTrue(_is_punctuation(u".")) 132 | 133 | self.assertFalse(_is_punctuation(u"A")) 134 | self.assertFalse(_is_punctuation(u" ")) 135 | 136 | 137 | if __name__ == '__main__': 138 | unittest.main() 139 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert Transformer XL checkpoint and datasets.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import os 21 | import sys 22 | from io import open 23 | 24 | import torch 25 | 26 | import pytorch_pretrained_bert.tokenization_transfo_xl as data_utils 27 | from pytorch_pretrained_bert.modeling_transfo_xl import (CONFIG_NAME, 28 | WEIGHTS_NAME, 29 | TransfoXLConfig, 30 | TransfoXLLMHeadModel, 31 | load_tf_weights_in_transfo_xl) 32 | from pytorch_pretrained_bert.tokenization_transfo_xl import (CORPUS_NAME, 33 | VOCAB_NAME) 34 | 35 | if sys.version_info[0] == 2: 36 | import cPickle as pickle 37 | else: 38 | import pickle 39 | 40 | # We do this to be able to load python 2 datasets pickles 41 | # See e.g. 
https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918
42 | data_utils.Vocab = data_utils.TransfoXLTokenizer
43 | data_utils.Corpus = data_utils.TransfoXLCorpus
44 | sys.modules['data_utils'] = data_utils
45 | sys.modules['vocabulary'] = data_utils
46 | 
47 | def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
48 |                                              transfo_xl_config_file,
49 |                                              pytorch_dump_folder_path,
50 |                                              transfo_xl_dataset_file):
51 |     if transfo_xl_dataset_file:
52 |         # Convert a pre-processed corpus (see original TensorFlow repo)
53 |         with open(transfo_xl_dataset_file, "rb") as fp:
54 |             corpus = pickle.load(fp, encoding="latin1")
55 |         # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term)
56 |         pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_NAME
57 |         print("Save vocabulary to {}".format(pytorch_vocab_dump_path))
58 |         corpus_vocab_dict = corpus.vocab.__dict__
59 |         torch.save(corpus_vocab_dict, pytorch_vocab_dump_path)
60 | 
61 |         corpus_dict_no_vocab = corpus.__dict__
62 |         corpus_dict_no_vocab.pop('vocab', None)
63 |         pytorch_dataset_dump_path = pytorch_dump_folder_path + '/' + CORPUS_NAME
64 |         print("Save dataset to {}".format(pytorch_dataset_dump_path))
65 |         torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path)
66 | 
67 |     if tf_checkpoint_path:
68 |         # Convert a pre-trained TensorFlow model
69 |         config_path = os.path.abspath(transfo_xl_config_file)
70 |         tf_path = os.path.abspath(tf_checkpoint_path)
71 | 
72 |         print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path))
73 |         # Initialise PyTorch model
74 |         if transfo_xl_config_file == "":
75 |             config = TransfoXLConfig()
76 |         else:
77 |             config = TransfoXLConfig(transfo_xl_config_file)
78 |         print("Building PyTorch model from configuration: {}".format(str(config)))
79 |         model = TransfoXLLMHeadModel(config)
80 | 
81 |         model = load_tf_weights_in_transfo_xl(model, config, tf_path)
82 |         # Save pytorch-model
83 |         pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
84 |         pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
85 |         print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path)))
86 |         torch.save(model.state_dict(), pytorch_weights_dump_path)
87 |         print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path)))
88 |         with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
89 |             f.write(config.to_json_string())
90 | 
91 | 
92 | if __name__ == "__main__":
93 |     parser = argparse.ArgumentParser()
94 |     parser.add_argument("--pytorch_dump_folder_path",
95 |                         default = None,
96 |                         type = str,
97 |                         required = True,
98 |                         help = "Path to the folder to store the PyTorch model or dataset/vocab.")
99 |     parser.add_argument("--tf_checkpoint_path",
100 |                         default = "",
101 |                         type = str,
102 |                         help = "An optional path to a TensorFlow checkpoint to be converted.")
103 |     parser.add_argument("--transfo_xl_config_file",
104 |                         default = "",
105 |                         type = str,
106 |                         help = "An optional config json file corresponding to the pre-trained Transformer-XL model. 
\n" 107 | "This specifies the model architecture.") 108 | parser.add_argument("--transfo_xl_dataset_file", 109 | default = "", 110 | type = str, 111 | help = "An optional dataset file to be converted in a vocabulary.") 112 | args = parser.parse_args() 113 | convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path, 114 | args.transfo_xl_config_file, 115 | args.pytorch_dump_folder_path, 116 | args.transfo_xl_dataset_file) 117 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/pytorch_pretrained_bert/optimization_openai.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """PyTorch optimization for OpenAI GPT model.""" 16 | 17 | import math 18 | import torch 19 | from torch.optim import Optimizer 20 | from torch.optim.optimizer import required 21 | from torch.nn.utils import clip_grad_norm_ 22 | 23 | def warmup_cosine(x, warmup=0.002): 24 | s = 1 if x <= warmup else 0 25 | return s*(x/warmup) + (1-s)*(0.5 * (1 + torch.cos(math.pi * x))) 26 | 27 | def warmup_constant(x, warmup=0.002): 28 | s = 1 if x <= warmup else 0 29 | return s*(x/warmup) + (1-s)*1 30 | 31 | def warmup_linear(x, warmup=0.002): 32 | s = 1 if x <= warmup else 0 33 | return (s*(x/warmup) + (1-s))*(1-x) 34 | 35 | SCHEDULES = { 36 | 'warmup_cosine':warmup_cosine, 37 | 'warmup_constant':warmup_constant, 38 | 'warmup_linear':warmup_linear, 39 | } 40 | 41 | 42 | class OpenAIAdam(Optimizer): 43 | """Implements Open AI version of Adam algorithm with weight decay fix. 
44 | """ 45 | def __init__(self, params, lr=required, schedule='warmup_linear', warmup=-1, t_total=-1, 46 | b1=0.9, b2=0.999, e=1e-8, weight_decay=0, 47 | vector_l2=False, max_grad_norm=-1, **kwargs): 48 | if lr is not required and lr < 0.0: 49 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 50 | if schedule not in SCHEDULES: 51 | raise ValueError("Invalid schedule parameter: {}".format(schedule)) 52 | if not 0.0 <= warmup < 1.0 and not warmup == -1: 53 | raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) 54 | if not 0.0 <= b1 < 1.0: 55 | raise ValueError("Invalid b1 parameter: {}".format(b1)) 56 | if not 0.0 <= b2 < 1.0: 57 | raise ValueError("Invalid b2 parameter: {}".format(b2)) 58 | if not e >= 0.0: 59 | raise ValueError("Invalid epsilon value: {}".format(e)) 60 | defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, 61 | b1=b1, b2=b2, e=e, weight_decay=weight_decay, vector_l2=vector_l2, 62 | max_grad_norm=max_grad_norm) 63 | super(OpenAIAdam, self).__init__(params, defaults) 64 | 65 | def get_lr(self): 66 | lr = [] 67 | for group in self.param_groups: 68 | for p in group['params']: 69 | state = self.state[p] 70 | if len(state) == 0: 71 | return [0] 72 | if group['t_total'] != -1: 73 | schedule_fct = SCHEDULES[group['schedule']] 74 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) 75 | else: 76 | lr_scheduled = group['lr'] 77 | lr.append(lr_scheduled) 78 | return lr 79 | 80 | def step(self, closure=None): 81 | """Performs a single optimization step. 82 | 83 | Arguments: 84 | closure (callable, optional): A closure that reevaluates the model 85 | and returns the loss. 86 | """ 87 | loss = None 88 | if closure is not None: 89 | loss = closure() 90 | 91 | for group in self.param_groups: 92 | for p in group['params']: 93 | if p.grad is None: 94 | continue 95 | grad = p.grad.data 96 | if grad.is_sparse: 97 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 98 | 99 | state = self.state[p] 100 | 101 | # State initialization 102 | if len(state) == 0: 103 | state['step'] = 0 104 | # Exponential moving average of gradient values 105 | state['exp_avg'] = torch.zeros_like(p.data) 106 | # Exponential moving average of squared gradient values 107 | state['exp_avg_sq'] = torch.zeros_like(p.data) 108 | 109 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 110 | beta1, beta2 = group['b1'], group['b2'] 111 | 112 | state['step'] += 1 113 | 114 | # Add grad clipping 115 | if group['max_grad_norm'] > 0: 116 | clip_grad_norm_(p, group['max_grad_norm']) 117 | 118 | # Decay the first and second moment running average coefficient 119 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 120 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 121 | denom = exp_avg_sq.sqrt().add_(group['e']) 122 | 123 | bias_correction1 = 1 - beta1 ** state['step'] 124 | bias_correction2 = 1 - beta2 ** state['step'] 125 | 126 | if group['t_total'] != -1: 127 | schedule_fct = SCHEDULES[group['schedule']] 128 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) 129 | else: 130 | lr_scheduled = group['lr'] 131 | 132 | step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 133 | 134 | p.data.addcdiv_(-step_size, exp_avg, denom) 135 | 136 | # Add weight decay at the end (fixed version) 137 | if (len(p.size()) > 1 or group['vector_l2']) and group['weight_decay'] > 0: 138 | 
p.data.add_(-lr_scheduled * group['weight_decay'], p.data)
139 | 
140 |         return loss
141 | 
--------------------------------------------------------------------------------
/sentence_similarity_Bert/pytorch_pretrained_bert/optimization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """PyTorch optimization for BERT model."""
16 | 
17 | import math
18 | import torch
19 | from torch.optim import Optimizer
20 | from torch.optim.optimizer import required
21 | from torch.nn.utils import clip_grad_norm_
22 | 
23 | def warmup_cosine(x, warmup=0.002):
24 |     if x < warmup:
25 |         return x/warmup
26 |     return 0.5 * (1.0 + math.cos(math.pi * x))  # math.cos: x is a plain float here, torch.cos would raise a TypeError
27 | 
28 | def warmup_constant(x, warmup=0.002):
29 |     if x < warmup:
30 |         return x/warmup
31 |     return 1.0
32 | 
33 | def warmup_linear(x, warmup=0.002):
34 |     if x < warmup:
35 |         return x/warmup
36 |     return 1.0 - x
37 | 
38 | SCHEDULES = {
39 |     'warmup_cosine':warmup_cosine,
40 |     'warmup_constant':warmup_constant,
41 |     'warmup_linear':warmup_linear,
42 | }
43 | 
44 | 
45 | class BertAdam(Optimizer):
46 |     """Implements BERT version of Adam algorithm with weight decay fix.
47 |     Params:
48 |         lr: learning rate
49 |         warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
50 |         t_total: total number of training steps for the learning
51 |             rate schedule, -1 means constant learning rate. Default: -1
52 |         schedule: schedule to use for the warmup (see above). Default: 'warmup_linear'
53 |         b1: Adams b1. Default: 0.9
54 |         b2: Adams b2. Default: 0.999
55 |         e: Adams epsilon. Default: 1e-6
56 |         weight_decay: Weight decay. Default: 0.01
57 |         max_grad_norm: Maximum norm for the gradients (-1 means no clipping). 
Default: 1.0 58 | """ 59 | def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', 60 | b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, 61 | max_grad_norm=1.0): 62 | if lr is not required and lr < 0.0: 63 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 64 | if schedule not in SCHEDULES: 65 | raise ValueError("Invalid schedule parameter: {}".format(schedule)) 66 | if not 0.0 <= warmup < 1.0 and not warmup == -1: 67 | raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) 68 | if not 0.0 <= b1 < 1.0: 69 | raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) 70 | if not 0.0 <= b2 < 1.0: 71 | raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) 72 | if not e >= 0.0: 73 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) 74 | defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, 75 | b1=b1, b2=b2, e=e, weight_decay=weight_decay, 76 | max_grad_norm=max_grad_norm) 77 | super(BertAdam, self).__init__(params, defaults) 78 | 79 | def get_lr(self): 80 | lr = [] 81 | for group in self.param_groups: 82 | for p in group['params']: 83 | state = self.state[p] 84 | if len(state) == 0: 85 | return [0] 86 | if group['t_total'] != -1: 87 | schedule_fct = SCHEDULES[group['schedule']] 88 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) 89 | else: 90 | lr_scheduled = group['lr'] 91 | lr.append(lr_scheduled) 92 | return lr 93 | 94 | def step(self, closure=None): 95 | """Performs a single optimization step. 96 | 97 | Arguments: 98 | closure (callable, optional): A closure that reevaluates the model 99 | and returns the loss. 100 | """ 101 | loss = None 102 | if closure is not None: 103 | loss = closure() 104 | 105 | for group in self.param_groups: 106 | for p in group['params']: 107 | if p.grad is None: 108 | continue 109 | grad = p.grad.data 110 | if grad.is_sparse: 111 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 112 | 113 | state = self.state[p] 114 | 115 | # State initialization 116 | if len(state) == 0: 117 | state['step'] = 0 118 | # Exponential moving average of gradient values 119 | state['next_m'] = torch.zeros_like(p.data) 120 | # Exponential moving average of squared gradient values 121 | state['next_v'] = torch.zeros_like(p.data) 122 | 123 | next_m, next_v = state['next_m'], state['next_v'] 124 | beta1, beta2 = group['b1'], group['b2'] 125 | 126 | # Add grad clipping 127 | if group['max_grad_norm'] > 0: 128 | clip_grad_norm_(p, group['max_grad_norm']) 129 | 130 | # Decay the first and second moment running average coefficient 131 | # In-place operations to update the averages at the same time 132 | next_m.mul_(beta1).add_(1 - beta1, grad) 133 | next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) 134 | update = next_m / (next_v.sqrt() + group['e']) 135 | 136 | # Just adding the square of the weights to the loss function is *not* 137 | # the correct way of using L2 regularization/weight decay with Adam, 138 | # since that will interact with the m and v parameters in strange ways. 139 | # 140 | # Instead we want to decay the weights in a manner that doesn't interact 141 | # with the m/v parameters. This is equivalent to adding the square 142 | # of the weights to the loss with plain (non-momentum) SGD. 
143 | if group['weight_decay'] > 0.0: 144 | update += group['weight_decay'] * p.data 145 | 146 | if group['t_total'] != -1: 147 | schedule_fct = SCHEDULES[group['schedule']] 148 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) 149 | else: 150 | lr_scheduled = group['lr'] 151 | 152 | update_with_lr = lr_scheduled * update 153 | p.data.add_(-update_with_lr) 154 | 155 | state['step'] += 1 156 | 157 | # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 158 | # No bias correction 159 | # bias_correction1 = 1 - beta1 ** state['step'] 160 | # bias_correction2 = 1 - beta2 ** state['step'] 161 | 162 | return loss 163 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/tests/modeling_gpt2_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import json 21 | import random 22 | 23 | import torch 24 | 25 | from pytorch_pretrained_bert import (GPT2Config, GPT2Model, 26 | GPT2LMHeadModel, GPT2DoubleHeadsModel) 27 | 28 | 29 | class GPT2ModelTest(unittest.TestCase): 30 | class GPT2ModelTester(object): 31 | 32 | def __init__(self, 33 | parent, 34 | batch_size=13, 35 | seq_length=7, 36 | is_training=True, 37 | use_position_ids=True, 38 | use_token_type_ids=True, 39 | use_labels=True, 40 | vocab_size=99, 41 | n_positions=33, 42 | n_embd=32, 43 | n_layer=5, 44 | n_head=4, 45 | n_choices=3, 46 | type_sequence_label_size=2, 47 | initializer_range=0.02, 48 | num_labels=3, 49 | scope=None): 50 | self.parent = parent 51 | self.batch_size = batch_size 52 | self.seq_length = seq_length 53 | self.is_training = is_training 54 | self.use_position_ids = use_position_ids 55 | self.use_token_type_ids = use_token_type_ids 56 | self.use_labels = use_labels 57 | self.vocab_size = vocab_size 58 | self.n_positions = n_positions 59 | self.n_embd = n_embd 60 | self.n_layer = n_layer 61 | self.n_head = n_head 62 | self.n_choices = n_choices 63 | self.type_sequence_label_size = type_sequence_label_size 64 | self.initializer_range = initializer_range 65 | self.num_labels = num_labels 66 | self.scope = scope 67 | 68 | def prepare_config_and_inputs(self): 69 | input_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.vocab_size) 70 | 71 | position_ids = None 72 | if self.use_position_ids: 73 | position_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions) 74 | 75 | token_type_ids = None 76 | if self.use_token_type_ids: 77 | total_voc = self.vocab_size 78 | token_type_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc) 79 | 80 | mc_labels = 
None 81 | lm_labels = None 82 | mc_token_ids = None 83 | if self.use_labels: 84 | mc_labels = GPT2ModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size) 85 | lm_labels = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels) 86 | mc_token_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices], self.seq_length) 87 | 88 | config = GPT2Config( 89 | vocab_size_or_config_json_file=self.vocab_size, 90 | n_positions=self.n_positions, 91 | n_embd=self.n_embd, 92 | n_layer=self.n_layer, 93 | n_head=self.n_head, 94 | initializer_range=self.initializer_range) 95 | 96 | return (config, input_ids, token_type_ids, position_ids, 97 | mc_labels, lm_labels, mc_token_ids) 98 | 99 | def create_gpt2_model(self, config, input_ids, token_type_ids, position_ids, 100 | mc_labels, lm_labels, mc_token_ids): 101 | model = GPT2Model(config) 102 | model.eval() 103 | hidden_states, presents = model(input_ids, position_ids, token_type_ids) 104 | outputs = { 105 | "hidden_states": hidden_states, 106 | "presents": presents, 107 | } 108 | return outputs 109 | 110 | def check_gpt2_model_output(self, result): 111 | self.parent.assertListEqual( 112 | list(result["hidden_states"].size()), 113 | [self.batch_size, self.n_choices, self.seq_length, self.n_embd]) 114 | 115 | 116 | def create_gpt2_lm_head(self, config, input_ids, token_type_ids, position_ids, 117 | mc_labels, lm_labels, mc_token_ids): 118 | model = GPT2LMHeadModel(config) 119 | model.eval() 120 | loss = model(input_ids, position_ids, token_type_ids, lm_labels) 121 | lm_logits, presents = model(input_ids, position_ids, token_type_ids) 122 | outputs = { 123 | "loss": loss, 124 | "lm_logits": lm_logits, 125 | "presents": presents, 126 | } 127 | return outputs 128 | 129 | def check_gpt2_lm_head_output(self, result): 130 | total_voc = self.vocab_size 131 | self.parent.assertListEqual( 132 | list(result["lm_logits"].size()), 133 | [self.batch_size, self.n_choices, self.seq_length, total_voc]) 134 | 135 | def check_gpt2_lm_head_loss_output(self, result): 136 | self.parent.assertListEqual( 137 | list(result["loss"].size()), 138 | []) 139 | 140 | def create_gpt2_double_heads(self, config, input_ids, token_type_ids, position_ids, 141 | mc_labels, lm_labels, mc_token_ids): 142 | model = GPT2DoubleHeadsModel(config) 143 | model.eval() 144 | loss = model(input_ids, mc_token_ids, 145 | lm_labels=lm_labels, mc_labels=mc_labels, 146 | token_type_ids=token_type_ids, position_ids=position_ids) 147 | lm_logits, mc_logits, presents = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids) 148 | outputs = { 149 | "loss": loss, 150 | "lm_logits": lm_logits, 151 | "mc_logits": mc_logits, 152 | "presents": presents, 153 | } 154 | return outputs 155 | 156 | def check_gpt2_double_heads_output(self, result): 157 | total_voc = self.vocab_size 158 | self.parent.assertListEqual( 159 | list(result["lm_logits"].size()), 160 | [self.batch_size, self.n_choices, self.seq_length, total_voc]) 161 | self.parent.assertListEqual( 162 | list(result["mc_logits"].size()), 163 | [self.batch_size, self.n_choices]) 164 | 165 | def check_gpt2_double_heads_loss_output(self, result): 166 | self.parent.assertListEqual( 167 | [list(l.size()) for l in result["loss"]], 168 | [[], []]) 169 | 170 | def test_default(self): 171 | self.run_tester(GPT2ModelTest.GPT2ModelTester(self)) 172 | 173 | def test_config_to_json_string(self): 174 | config = GPT2Config(vocab_size_or_config_json_file=99, n_embd=37) 175 | obj = 
json.loads(config.to_json_string()) 176 | self.assertEqual(obj["vocab_size"], 99) 177 | self.assertEqual(obj["n_embd"], 37) 178 | 179 | def run_tester(self, tester): 180 | config_and_inputs = tester.prepare_config_and_inputs() 181 | output_result = tester.create_gpt2_model(*config_and_inputs) 182 | tester.check_gpt2_model_output(output_result) 183 | 184 | output_result = tester.create_gpt2_lm_head(*config_and_inputs) 185 | tester.check_gpt2_lm_head_output(output_result) 186 | tester.check_gpt2_lm_head_loss_output(output_result) 187 | 188 | output_result = tester.create_gpt2_double_heads(*config_and_inputs) 189 | tester.check_gpt2_double_heads_output(output_result) 190 | tester.check_gpt2_double_heads_loss_output(output_result) 191 | 192 | @classmethod 193 | def ids_tensor(cls, shape, vocab_size, rng=None, name=None): 194 | """Creates a random int32 tensor of the shape within the vocab size.""" 195 | if rng is None: 196 | rng = random.Random() 197 | 198 | total_dims = 1 199 | for dim in shape: 200 | total_dims *= dim 201 | 202 | values = [] 203 | for _ in range(total_dims): 204 | values.append(rng.randint(0, vocab_size - 1)) 205 | 206 | return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous() 207 | 208 | 209 | if __name__ == "__main__": 210 | unittest.main() 211 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/pytorch_pretrained_bert/file_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for working with the local dataset cache. 3 | This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp 4 | Copyright by the AllenNLP authors. 5 | """ 6 | from __future__ import (absolute_import, division, print_function, unicode_literals) 7 | 8 | import json 9 | import logging 10 | import os 11 | import shutil 12 | import tempfile 13 | from functools import wraps 14 | from hashlib import sha256 15 | import sys 16 | from io import open 17 | 18 | import boto3 19 | import requests 20 | from botocore.exceptions import ClientError 21 | from tqdm import tqdm 22 | 23 | try: 24 | from urllib.parse import urlparse 25 | except ImportError: 26 | from urlparse import urlparse 27 | 28 | try: 29 | from pathlib import Path 30 | PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 31 | Path.home() / '.pytorch_pretrained_bert')) 32 | except AttributeError: 33 | PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 34 | os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert')) 35 | 36 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 37 | 38 | 39 | def url_to_filename(url, etag=None): 40 | """ 41 | Convert `url` into a hashed filename in a repeatable way. 42 | If `etag` is specified, append its hash to the url's, delimited 43 | by a period. 44 | """ 45 | url_bytes = url.encode('utf-8') 46 | url_hash = sha256(url_bytes) 47 | filename = url_hash.hexdigest() 48 | 49 | if etag: 50 | etag_bytes = etag.encode('utf-8') 51 | etag_hash = sha256(etag_bytes) 52 | filename += '.' + etag_hash.hexdigest() 53 | 54 | return filename 55 | 56 | 57 | def filename_to_url(filename, cache_dir=None): 58 | """ 59 | Return the url and etag (which may be ``None``) stored for `filename`. 60 | Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. 
61 | """ 62 | if cache_dir is None: 63 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 64 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 65 | cache_dir = str(cache_dir) 66 | 67 | cache_path = os.path.join(cache_dir, filename) 68 | if not os.path.exists(cache_path): 69 | raise EnvironmentError("file {} not found".format(cache_path)) 70 | 71 | meta_path = cache_path + '.json' 72 | if not os.path.exists(meta_path): 73 | raise EnvironmentError("file {} not found".format(meta_path)) 74 | 75 | with open(meta_path, encoding="utf-8") as meta_file: 76 | metadata = json.load(meta_file) 77 | url = metadata['url'] 78 | etag = metadata['etag'] 79 | 80 | return url, etag 81 | 82 | 83 | def cached_path(url_or_filename, cache_dir=None): 84 | """ 85 | Given something that might be a URL (or might be a local path), 86 | determine which. If it's a URL, download the file and cache it, and 87 | return the path to the cached file. If it's already a local path, 88 | make sure the file exists and then return the path. 89 | """ 90 | if cache_dir is None: 91 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 92 | if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): 93 | url_or_filename = str(url_or_filename) 94 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 95 | cache_dir = str(cache_dir) 96 | 97 | parsed = urlparse(url_or_filename) 98 | 99 | if parsed.scheme in ('http', 'https', 's3'): 100 | # URL, so get it from the cache (downloading if necessary) 101 | return get_from_cache(url_or_filename, cache_dir) 102 | elif os.path.exists(url_or_filename): 103 | # File, and it exists. 104 | return url_or_filename 105 | elif parsed.scheme == '': 106 | # File, but it doesn't exist. 107 | raise EnvironmentError("file {} not found".format(url_or_filename)) 108 | else: 109 | # Something unknown 110 | raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) 111 | 112 | 113 | def split_s3_path(url): 114 | """Split a full s3 path into the bucket name and path.""" 115 | parsed = urlparse(url) 116 | if not parsed.netloc or not parsed.path: 117 | raise ValueError("bad s3 path {}".format(url)) 118 | bucket_name = parsed.netloc 119 | s3_path = parsed.path 120 | # Remove '/' at beginning of path. 121 | if s3_path.startswith("/"): 122 | s3_path = s3_path[1:] 123 | return bucket_name, s3_path 124 | 125 | 126 | def s3_request(func): 127 | """ 128 | Wrapper function for s3 requests in order to create more helpful error 129 | messages. 
130 | """ 131 | 132 | @wraps(func) 133 | def wrapper(url, *args, **kwargs): 134 | try: 135 | return func(url, *args, **kwargs) 136 | except ClientError as exc: 137 | if int(exc.response["Error"]["Code"]) == 404: 138 | raise EnvironmentError("file {} not found".format(url)) 139 | else: 140 | raise 141 | 142 | return wrapper 143 | 144 | 145 | @s3_request 146 | def s3_etag(url): 147 | """Check ETag on S3 object.""" 148 | s3_resource = boto3.resource("s3") 149 | bucket_name, s3_path = split_s3_path(url) 150 | s3_object = s3_resource.Object(bucket_name, s3_path) 151 | return s3_object.e_tag 152 | 153 | 154 | @s3_request 155 | def s3_get(url, temp_file): 156 | """Pull a file directly from S3.""" 157 | s3_resource = boto3.resource("s3") 158 | bucket_name, s3_path = split_s3_path(url) 159 | s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) 160 | 161 | 162 | def http_get(url, temp_file): 163 | req = requests.get(url, stream=True) 164 | content_length = req.headers.get('Content-Length') 165 | total = int(content_length) if content_length is not None else None 166 | progress = tqdm(unit="B", total=total) 167 | for chunk in req.iter_content(chunk_size=1024): 168 | if chunk: # filter out keep-alive new chunks 169 | progress.update(len(chunk)) 170 | temp_file.write(chunk) 171 | progress.close() 172 | 173 | 174 | def get_from_cache(url, cache_dir=None): 175 | """ 176 | Given a URL, look for the corresponding dataset in the local cache. 177 | If it's not there, download it. Then return the path to the cached file. 178 | """ 179 | if cache_dir is None: 180 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 181 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 182 | cache_dir = str(cache_dir) 183 | 184 | if not os.path.exists(cache_dir): 185 | os.makedirs(cache_dir) 186 | 187 | # Get eTag to add to filename, if it exists. 188 | if url.startswith("s3://"): 189 | etag = s3_etag(url) 190 | else: 191 | response = requests.head(url, allow_redirects=True) 192 | if response.status_code != 200: 193 | raise IOError("HEAD request failed for url {} with status code {}" 194 | .format(url, response.status_code)) 195 | etag = response.headers.get("ETag") 196 | 197 | filename = url_to_filename(url, etag) 198 | 199 | # get cache path to put the file 200 | cache_path = os.path.join(cache_dir, filename) 201 | 202 | if not os.path.exists(cache_path): 203 | # Download to temporary file, then copy to cache dir once finished. 204 | # Otherwise you get corrupt cache entries if the download gets interrupted. 
205 | with tempfile.NamedTemporaryFile() as temp_file: 206 | logger.info("%s not found in cache, downloading to %s", url, temp_file.name) 207 | 208 | # GET file object 209 | if url.startswith("s3://"): 210 | s3_get(url, temp_file) 211 | else: 212 | http_get(url, temp_file) 213 | 214 | # we are copying the file before closing it, so flush to avoid truncation 215 | temp_file.flush() 216 | # shutil.copyfileobj() starts at the current position, so go to the start 217 | temp_file.seek(0) 218 | 219 | logger.info("copying %s to cache at %s", temp_file.name, cache_path) 220 | with open(cache_path, 'wb') as cache_file: 221 | shutil.copyfileobj(temp_file, cache_file) 222 | 223 | logger.info("creating metadata file for %s", cache_path) 224 | meta = {'url': url, 'etag': etag} 225 | meta_path = cache_path + '.json' 226 | with open(meta_path, 'w', encoding="utf-8") as meta_file: 227 | json.dump(meta, meta_file) 228 | 229 | logger.info("removing temp file %s", temp_file.name) 230 | 231 | return cache_path 232 | 233 | 234 | def read_set_from_file(filename): 235 | ''' 236 | Extract a de-duped collection (set) of text from a file. 237 | Expected file format is one item per line. 238 | ''' 239 | collection = set() 240 | with open(filename, 'r', encoding='utf-8') as file_: 241 | for line in file_: 242 | collection.add(line.rstrip()) 243 | return collection 244 | 245 | 246 | def get_file_extension(path, dot=True, lower=True): 247 | ext = os.path.splitext(path)[1] 248 | ext = ext if dot else ext[1:] 249 | return ext.lower() if lower else ext 250 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/pytorch_pretrained_bert/tokenization_gpt2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for OpenAI GPT.""" 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import json 20 | import logging 21 | import os 22 | import regex as re 23 | from io import open 24 | 25 | try: 26 | from functools import lru_cache 27 | except ImportError: 28 | # Just a dummy decorator to get the checks to run on python2 29 | # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 
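# (The fallback below is a no-op decorator factory: @lru_cache() just returns the
# wrapped function unchanged, so on Python 2 bytes_to_unicode() is recomputed on every call.)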
30 | def lru_cache(): 31 | return lambda func: func 32 | 33 | from .file_utils import cached_path 34 | 35 | logger = logging.getLogger(__name__) 36 | 37 | PRETRAINED_VOCAB_ARCHIVE_MAP = { 38 | 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", 39 | } 40 | PRETRAINED_MERGES_ARCHIVE_MAP = { 41 | 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", 42 | } 43 | PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { 44 | 'gpt2': 1024, 45 | } 46 | VOCAB_NAME = 'vocab.json' 47 | MERGES_NAME = 'merges.txt' 48 | 49 | @lru_cache() 50 | def bytes_to_unicode(): 51 | """ 52 | Returns a dict mapping utf-8 bytes to unicode strings. 53 | The reversible bpe codes work on unicode strings. 54 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 55 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 56 | This is a significant percentage of your normal, say, 32K bpe vocab. 57 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 58 | It also avoids mapping to whitespace/control characters that the bpe code barfs on. 59 | """ 60 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 61 | cs = bs[:] 62 | n = 0 63 | for b in range(2**8): 64 | if b not in bs: 65 | bs.append(b) 66 | cs.append(2**8+n) 67 | n += 1 68 | cs = [chr(n) for n in cs] 69 | return dict(zip(bs, cs)) 70 | 71 | def get_pairs(word): 72 | """Return set of symbol pairs in a word. 73 | 74 | Word is represented as tuple of symbols (symbols being variable-length strings). 75 | """ 76 | pairs = set() 77 | prev_char = word[0] 78 | for char in word[1:]: 79 | pairs.add((prev_char, char)) 80 | prev_char = char 81 | return pairs 82 | 83 | class GPT2Tokenizer(object): 84 | """ 85 | GPT-2 BPE tokenizer. Peculiarities: 86 | - Byte-level BPE 87 | """ 88 | @classmethod 89 | def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): 90 | """ 91 | Instantiate a GPT2Tokenizer from a pre-trained model file. 92 | Download and cache the pre-trained model file if needed. 93 | """ 94 | if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: 95 | vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] 96 | merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path] 97 | else: 98 | vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) 99 | merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME) 100 | # redirect to the cache, if necessary 101 | try: 102 | resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) 103 | resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir) 104 | except EnvironmentError: 105 | logger.error( 106 | "Model name '{}' was not found in model name list ({}). " 107 | "We assumed '{}' was a path or url but couldn't find files {} and {} " 108 | "at this path or url.".format( 109 | pretrained_model_name_or_path, 110 | ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), 111 | pretrained_model_name_or_path, 112 | vocab_file, merges_file)) 113 | return None 114 | if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file: 115 | logger.info("loading vocabulary file {}".format(vocab_file)) 116 | logger.info("loading merges file {}".format(merges_file)) 117 | else: 118 | logger.info("loading vocabulary file {} from cache at {}".format( 119 | vocab_file, resolved_vocab_file)) 120 | logger.info("loading merges file {} from cache at {}".format( 121 | merges_file, resolved_merges_file)) 122 | if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: 123 | # if we're using a pretrained model, ensure the tokenizer won't index sequences longer 124 | # than the number of positional embeddings 125 | max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] 126 | kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) 127 | # Instantiate tokenizer. 128 | tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs) 129 | return tokenizer 130 | 131 | def __init__(self, vocab_file, merges_file, errors='replace', max_len=None): 132 | self.max_len = max_len if max_len is not None else int(1e12) 133 | self.encoder = json.load(open(vocab_file)) 134 | self.decoder = {v:k for k,v in self.encoder.items()} 135 | self.errors = errors # how to handle errors in decoding 136 | self.byte_encoder = bytes_to_unicode() 137 | self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} 138 | bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] 139 | bpe_merges = [tuple(merge.split()) for merge in bpe_data] 140 | self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) 141 | self.cache = {} 142 | 143 | # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions 144 | self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") 145 | 146 | def __len__(self): 147 | return len(self.encoder) 148 | 149 | def bpe(self, token): 150 | if token in self.cache: 151 | return self.cache[token] 152 | word = tuple(token) 153 | pairs = get_pairs(word) 154 | 155 | if not pairs: 156 | return token 157 | 158 | while True: 159 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 160 | if bigram not in self.bpe_ranks: 161 | break 162 | first, second = bigram 163 | new_word = [] 164 | i = 0 165 | while i < len(word): 166 | try: 167 | j = word.index(first, i) 168 | new_word.extend(word[i:j]) 169 | i = j 170 | except ValueError: 171 | new_word.extend(word[i:]) 172 | break 173 | 174 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 175 | new_word.append(first+second) 176 | i += 2 177 | else: 178 | new_word.append(word[i]) 179 | i += 1 180 | new_word = tuple(new_word) 181 | word = new_word 182 | if len(word) == 1: 183 | break 184 | else: 185 | pairs = get_pairs(word) 186 | word = ' '.join(word) 187 | self.cache[token] = word 188 | return word 189 | 190 | def encode(self, text): 191 | bpe_tokens = [] 192 | for token in re.findall(self.pat, text): 193 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 194 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 195 | if len(bpe_tokens) > self.max_len: 196 | raise
ValueError( 197 | "Token indices sequence length is longer than the specified maximum " 198 | " sequence length for this OpenAI GPT-2 model ({} > {}). Running this" 199 | " sequence through the model will result in indexing errors".format(len(bpe_tokens), self.max_len) 200 | ) 201 | return bpe_tokens 202 | 203 | def decode(self, tokens): 204 | text = ''.join([self.decoder[token] for token in tokens]) 205 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) 206 | return text 207 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/tests/modeling_transfo_xl_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import json 21 | import random 22 | 23 | import torch 24 | 25 | from pytorch_pretrained_bert import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel) 26 | 27 | 28 | class TransfoXLModelTest(unittest.TestCase): 29 | class TransfoXLModelTester(object): 30 | 31 | def __init__(self, 32 | parent, 33 | batch_size=13, 34 | seq_length=7, 35 | mem_len=30, 36 | clamp_len=15, 37 | is_training=True, 38 | use_labels=True, 39 | vocab_size=99, 40 | cutoffs=[10, 50, 80], 41 | d_model=32, 42 | d_embed=32, 43 | n_head=4, 44 | d_head=8, 45 | d_inner=128, 46 | div_val=2, 47 | n_layer=5, 48 | scope=None, 49 | seed=1): 50 | self.parent = parent 51 | self.batch_size = batch_size 52 | self.seq_length = seq_length 53 | self.mem_len = mem_len 54 | self.clamp_len = clamp_len 55 | self.is_training = is_training 56 | self.use_labels = use_labels 57 | self.vocab_size = vocab_size 58 | self.cutoffs = cutoffs 59 | self.d_model = d_model 60 | self.d_embed = d_embed 61 | self.n_head = n_head 62 | self.d_head = d_head 63 | self.d_inner = d_inner 64 | self.div_val = div_val 65 | self.n_layer = n_layer 66 | self.scope = scope 67 | self.seed = seed 68 | 69 | def prepare_config_and_inputs(self): 70 | input_ids_1 = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size) 71 | input_ids_2 = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size) 72 | 73 | lm_labels = None 74 | if self.use_labels: 75 | lm_labels = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size) 76 | 77 | config = TransfoXLConfig( 78 | vocab_size_or_config_json_file=self.vocab_size, 79 | mem_len=self.mem_len, 80 | clamp_len=self.clamp_len, 81 | cutoffs=self.cutoffs, 82 | d_model=self.d_model, 83 | d_embed=self.d_embed, 84 | n_head=self.n_head, 85 | d_head=self.d_head, 86 | d_inner=self.d_inner, 87 | div_val=self.div_val, 88 | n_layer=self.n_layer) 89 | 90 | return (config, input_ids_1, input_ids_2, lm_labels) 91 | 92 | def 
set_seed(self): 93 | random.seed(self.seed) 94 | torch.manual_seed(self.seed) 95 | 96 | def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels): 97 | model = TransfoXLModel(config) 98 | model.eval() 99 | 100 | hidden_states_1, mems_1 = model(input_ids_1) 101 | hidden_states_2, mems_2 = model(input_ids_2, mems_1) 102 | outputs = { 103 | "hidden_states_1": hidden_states_1, 104 | "mems_1": mems_1, 105 | "hidden_states_2": hidden_states_2, 106 | "mems_2": mems_2, 107 | } 108 | return outputs 109 | 110 | def check_transfo_xl_model_output(self, result): 111 | self.parent.assertListEqual( 112 | list(result["hidden_states_1"].size()), 113 | [self.batch_size, self.seq_length, self.d_model]) 114 | self.parent.assertListEqual( 115 | list(result["hidden_states_2"].size()), 116 | [self.batch_size, self.seq_length, self.d_model]) 117 | self.parent.assertListEqual( 118 | list(list(mem.size()) for mem in result["mems_1"]), 119 | [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer) 120 | self.parent.assertListEqual( 121 | list(list(mem.size()) for mem in result["mems_2"]), 122 | [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer) 123 | 124 | 125 | def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels): 126 | model = TransfoXLLMHeadModel(config) 127 | model.eval() 128 | 129 | loss_1, mems_1a = model(input_ids_1, target=lm_labels) 130 | lm_logits_1, mems_1b = model(input_ids_1) 131 | 132 | loss_2, mems_2a = model(input_ids_2, target=lm_labels, mems=mems_1a) 133 | lm_logits_2, mems_2b = model(input_ids_2, mems=mems_1b) 134 | 135 | outputs = { 136 | "loss_1": loss_1, 137 | "mems_1a": mems_1a, 138 | "lm_logits_1": lm_logits_1, 139 | "mems_1b": mems_1b, 140 | "loss_2": loss_2, 141 | "mems_2a": mems_2a, 142 | "lm_logits_2": lm_logits_2, 143 | "mems_2b": mems_2b, 144 | } 145 | return outputs 146 | 147 | def check_transfo_xl_lm_head_output(self, result): 148 | self.parent.assertListEqual( 149 | list(result["loss_1"].size()), 150 | [self.batch_size, self.seq_length]) 151 | self.parent.assertListEqual( 152 | list(result["lm_logits_1"].size()), 153 | [self.batch_size, self.seq_length, self.vocab_size]) 154 | self.parent.assertListEqual( 155 | list(list(mem.size()) for mem in result["mems_1a"]), 156 | [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer) 157 | self.parent.assertListEqual( 158 | list(list(mem.size()) for mem in result["mems_1b"]), 159 | [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer) 160 | self.parent.assertListEqual( 161 | list(mem[~torch.isnan(mem)].sum() for mem in result["mems_1a"]), 162 | list(mem[~torch.isnan(mem)].sum() for mem in result["mems_1b"])) 163 | 164 | self.parent.assertListEqual( 165 | list(result["loss_2"].size()), 166 | [self.batch_size, self.seq_length]) 167 | self.parent.assertListEqual( 168 | list(result["lm_logits_2"].size()), 169 | [self.batch_size, self.seq_length, self.vocab_size]) 170 | self.parent.assertListEqual( 171 | list(list(mem.size()) for mem in result["mems_2a"]), 172 | [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer) 173 | self.parent.assertListEqual( 174 | list(list(mem.size()) for mem in result["mems_2b"]), 175 | [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer) 176 | self.parent.assertListEqual( 177 | list(mem[~torch.isnan(mem)].sum() for mem in result["mems_2a"]), 178 | list(mem[~torch.isnan(mem)].sum() for mem in result["mems_2b"])) 179 | 180 | def test_default(self): 181 | self.run_tester(TransfoXLModelTest.TransfoXLModelTester(self)) 
182 | 183 | def test_config_to_json_string(self): 184 | config = TransfoXLConfig(vocab_size_or_config_json_file=96, d_embed=37) 185 | obj = json.loads(config.to_json_string()) 186 | self.assertEqual(obj["n_token"], 96) 187 | self.assertEqual(obj["d_embed"], 37) 188 | 189 | def run_tester(self, tester): 190 | config_and_inputs = tester.prepare_config_and_inputs() 191 | 192 | tester.set_seed() 193 | output_result = tester.create_transfo_xl_model(*config_and_inputs) 194 | tester.check_transfo_xl_model_output(output_result) 195 | 196 | tester.set_seed() 197 | output_result = tester.create_transfo_xl_lm_head(*config_and_inputs) 198 | tester.check_transfo_xl_lm_head_output(output_result) 199 | 200 | @classmethod 201 | def ids_tensor(cls, shape, vocab_size, rng=None, name=None): 202 | """Creates a random int32 tensor of the shape within the vocab size.""" 203 | if rng is None: 204 | rng = random.Random() 205 | 206 | total_dims = 1 207 | for dim in shape: 208 | total_dims *= dim 209 | 210 | values = [] 211 | for _ in range(total_dims): 212 | values.append(rng.randint(0, vocab_size - 1)) 213 | 214 | return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous() 215 | 216 | 217 | if __name__ == "__main__": 218 | unittest.main() 219 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/tests/modeling_openai_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
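Like the GPT-2 and Transformer-XL tests above, the OpenAI GPT test below builds random integer id tensors with an `ids_tensor` helper, runs each model head in eval mode, and asserts on output shapes only. A minimal standalone sketch of that shared pattern (the toy sizes are illustrative, not taken from the test files):

    import random
    import torch

    def ids_tensor(shape, vocab_size, rng=None):
        # Random torch.long tensor with values in [0, vocab_size),
        # mirroring the ids_tensor classmethod used by these tests.
        rng = rng or random.Random()
        total = 1
        for dim in shape:
            total *= dim
        values = [rng.randint(0, vocab_size - 1) for _ in range(total)]
        return torch.tensor(values, dtype=torch.long).view(shape)

    batch_size, seq_length, vocab_size = 2, 5, 99
    input_ids = ids_tensor([batch_size, seq_length], vocab_size)
    assert list(input_ids.size()) == [batch_size, seq_length]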
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import json 21 | import random 22 | 23 | import torch 24 | 25 | from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel, 26 | OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) 27 | 28 | 29 | class OpenAIGPTModelTest(unittest.TestCase): 30 | class OpenAIGPTModelTester(object): 31 | 32 | def __init__(self, 33 | parent, 34 | batch_size=13, 35 | seq_length=7, 36 | is_training=True, 37 | use_position_ids=True, 38 | use_token_type_ids=True, 39 | use_labels=True, 40 | vocab_size=99, 41 | n_special=1, 42 | n_positions=33, 43 | n_embd=32, 44 | n_layer=5, 45 | n_head=4, 46 | n_choices=3, 47 | afn="gelu", 48 | resid_pdrop=0.1, 49 | attn_pdrop=0.1, 50 | embd_pdrop=0.1, 51 | type_sequence_label_size=2, 52 | initializer_range=0.02, 53 | num_labels=3, 54 | scope=None): 55 | self.parent = parent 56 | self.batch_size = batch_size 57 | self.seq_length = seq_length 58 | self.is_training = is_training 59 | self.use_position_ids = use_position_ids 60 | self.use_token_type_ids = use_token_type_ids 61 | self.use_labels = use_labels 62 | self.vocab_size = vocab_size 63 | self.n_special = n_special 64 | self.n_positions = n_positions 65 | self.n_embd = n_embd 66 | self.n_layer = n_layer 67 | self.n_head = n_head 68 | self.afn = afn 69 | self.n_choices = n_choices 70 | self.resid_pdrop = resid_pdrop 71 | self.attn_pdrop = attn_pdrop 72 | self.embd_pdrop = embd_pdrop 73 | self.type_sequence_label_size = type_sequence_label_size 74 | self.initializer_range = initializer_range 75 | self.num_labels = num_labels 76 | self.scope = scope 77 | 78 | def prepare_config_and_inputs(self): 79 | input_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.vocab_size) 80 | 81 | position_ids = None 82 | if self.use_position_ids: 83 | position_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions) 84 | 85 | token_type_ids = None 86 | if self.use_token_type_ids: 87 | total_voc = self.vocab_size + self.n_special 88 | token_type_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc) 89 | 90 | mc_labels = None 91 | lm_labels = None 92 | mc_token_ids = None 93 | if self.use_labels: 94 | mc_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size) 95 | lm_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels) 96 | mc_token_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices], self.seq_length) 97 | 98 | config = OpenAIGPTConfig( 99 | vocab_size_or_config_json_file=self.vocab_size, 100 | n_positions=self.n_positions, 101 | n_special=self.n_special, 102 | n_embd=self.n_embd, 103 | n_layer=self.n_layer, 104 | n_head=self.n_head, 105 | afn=self.afn, 106 | resid_pdrop=self.resid_pdrop, 107 | attn_pdrop=self.attn_pdrop, 108 | embd_pdrop=self.embd_pdrop, 109 | initializer_range=self.initializer_range) 110 | 111 | return (config, input_ids, token_type_ids, position_ids, 112 | mc_labels, lm_labels, mc_token_ids) 113 | 114 | def create_openai_model(self, config, input_ids, token_type_ids, position_ids, 115 | mc_labels, lm_labels, mc_token_ids): 116 | model = OpenAIGPTModel(config) 117 | model.eval() 118 | hidden_states = model(input_ids, position_ids, token_type_ids) 119 | outputs = { 120 | "hidden_states": hidden_states, 121 | } 122 | return outputs 123 
| 124 | def check_openai_model_output(self, result): 125 | self.parent.assertListEqual( 126 | list(result["hidden_states"].size()), 127 | [self.batch_size, self.n_choices, self.seq_length, self.n_embd]) 128 | 129 | 130 | def create_openai_lm_head(self, config, input_ids, token_type_ids, position_ids, 131 | mc_labels, lm_labels, mc_token_ids): 132 | model = OpenAIGPTLMHeadModel(config) 133 | model.eval() 134 | loss = model(input_ids, position_ids, token_type_ids, lm_labels) 135 | lm_logits = model(input_ids, position_ids, token_type_ids) 136 | outputs = { 137 | "loss": loss, 138 | "lm_logits": lm_logits, 139 | } 140 | return outputs 141 | 142 | def check_openai_lm_head_output(self, result): 143 | total_voc = self.n_special + self.vocab_size 144 | self.parent.assertListEqual( 145 | list(result["lm_logits"].size()), 146 | [self.batch_size, self.n_choices, self.seq_length, total_voc]) 147 | 148 | def check_openai_lm_head_loss_output(self, result): 149 | self.parent.assertListEqual( 150 | list(result["loss"].size()), 151 | []) 152 | 153 | def create_openai_double_heads(self, config, input_ids, token_type_ids, position_ids, 154 | mc_labels, lm_labels, mc_token_ids): 155 | model = OpenAIGPTDoubleHeadsModel(config) 156 | model.eval() 157 | loss = model(input_ids, mc_token_ids, 158 | lm_labels=lm_labels, mc_labels=mc_labels, 159 | token_type_ids=token_type_ids, position_ids=position_ids) 160 | lm_logits, mc_logits = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids) 161 | outputs = { 162 | "loss": loss, 163 | "lm_logits": lm_logits, 164 | "mc_logits": mc_logits, 165 | } 166 | return outputs 167 | 168 | def check_openai_double_heads_output(self, result): 169 | total_voc = self.n_special + self.vocab_size 170 | self.parent.assertListEqual( 171 | list(result["lm_logits"].size()), 172 | [self.batch_size, self.n_choices, self.seq_length, total_voc]) 173 | self.parent.assertListEqual( 174 | list(result["mc_logits"].size()), 175 | [self.batch_size, self.n_choices]) 176 | 177 | def check_openai_double_heads_loss_output(self, result): 178 | self.parent.assertListEqual( 179 | [list(l.size()) for l in result["loss"]], 180 | [[], []]) 181 | 182 | def test_default(self): 183 | self.run_tester(OpenAIGPTModelTest.OpenAIGPTModelTester(self)) 184 | 185 | def test_config_to_json_string(self): 186 | config = OpenAIGPTConfig(vocab_size_or_config_json_file=99, n_embd=37) 187 | obj = json.loads(config.to_json_string()) 188 | self.assertEqual(obj["vocab_size"], 99) 189 | self.assertEqual(obj["n_embd"], 37) 190 | 191 | def run_tester(self, tester): 192 | config_and_inputs = tester.prepare_config_and_inputs() 193 | output_result = tester.create_openai_model(*config_and_inputs) 194 | tester.check_openai_model_output(output_result) 195 | 196 | output_result = tester.create_openai_lm_head(*config_and_inputs) 197 | tester.check_openai_lm_head_output(output_result) 198 | tester.check_openai_lm_head_loss_output(output_result) 199 | 200 | output_result = tester.create_openai_double_heads(*config_and_inputs) 201 | tester.check_openai_double_heads_output(output_result) 202 | tester.check_openai_double_heads_loss_output(output_result) 203 | 204 | @classmethod 205 | def ids_tensor(cls, shape, vocab_size, rng=None, name=None): 206 | """Creates a random int32 tensor of the shape within the vocab size.""" 207 | if rng is None: 208 | rng = random.Random() 209 | 210 | total_dims = 1 211 | for dim in shape: 212 | total_dims *= dim 213 | 214 | values = [] 215 | for _ in range(total_dims): 216 | 
values.append(rng.randint(0, vocab_size - 1)) 217 | 218 | return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous() 219 | 220 | 221 | if __name__ == "__main__": 222 | unittest.main() 223 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/examples/run_classifier_class.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | import argparse 3 | import csv 4 | import logging 5 | import time 6 | import os 7 | import random 8 | import sys 9 | import re 10 | import json 11 | import numpy as np 12 | import torch 13 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 14 | TensorDataset) 15 | from torch.utils.data.distributed import DistributedSampler 16 | from tqdm import tqdm, trange 17 | import torch.nn.functional as F 18 | 19 | from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE 20 | from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME 21 | from pytorch_pretrained_bert.tokenization import BertTokenizer 22 | from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear 23 | 24 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 25 | datefmt='%m/%d/%Y %H:%M:%S', 26 | level=logging.INFO) 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | class InputExample(object): 31 | """A single training/test example for simple sequence classification.""" 32 | 33 | def __init__(self, guid, text_a, text_b=None, label=None): 34 | """Constructs an InputExample. 35 | 36 | Args: 37 | guid: Unique id for the example. 38 | text_a: string. The untokenized text of the first sequence. For single 39 | sequence tasks, only this sequence must be specified. 40 | text_b: (Optional) string. The untokenized text of the second sequence. 41 | Must be specified only for sequence pair tasks. 42 | label: (Optional) string. The label of the example. This should be 43 | specified for train and dev examples, but not for test examples.
44 | """ 45 | self.guid = guid 46 | self.text_a = text_a 47 | self.text_b = text_b 48 | self.label = label 49 | 50 | 51 | class InputFeatures(object): 52 | """A single set of features of data.""" 53 | 54 | def __init__(self, input_ids, input_mask, segment_ids, label_id): 55 | self.input_ids = input_ids 56 | self.input_mask = input_mask 57 | self.segment_ids = segment_ids 58 | self.label_id = label_id 59 | 60 | 61 | class DataProcessor(object): 62 | """Base class for data converters for sequence classification data sets.""" 63 | 64 | def get_train_examples(self, data_dir): 65 | """Gets a collection of `InputExample`s for the train set.""" 66 | raise NotImplementedError() 67 | 68 | def get_dev_examples(self, data_dir): 69 | """Gets a collection of `InputExample`s for the dev set.""" 70 | raise NotImplementedError() 71 | 72 | def get_labels(self): 73 | """Gets the list of labels for this data set.""" 74 | raise NotImplementedError() 75 | 76 | def _read_txt(self, line, quotechar=None): 77 | """Parses a single 'text_1#text_2<tab>label' line into [[text_1, text_2, label]].""" 78 | lines = [] 79 | line = line.strip() 80 | label = line[-1] 81 | text_1 = line[:-1].strip().split('#')[0] 82 | text_2 = line[:-1].strip().split('#')[1] 83 | ll_line = [text_1, text_2, label] 84 | lines.append(ll_line) 85 | return lines 86 | 87 | 88 | class SimProcessor(DataProcessor): 89 | 90 | def get_dev_examples(self, line): 91 | 92 | return self._create_examples(self._read_txt(line), "dev") 93 | # guid, sen1, sen2, label 94 | 95 | 96 | def _create_examples(self, lines, set_type): 97 | """Creates examples for the training and dev sets.""" 98 | examples = [] 99 | for (i, line) in enumerate(lines): 100 | guid = "%s-%s" % (set_type, i) 101 | text_a = line[0] 102 | text_b = line[1] 103 | label = line[2] 104 | examples.append( 105 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 106 | return examples 107 | 108 | 109 | # Returns all label classes 110 | def get_labels(self): 111 | """See base class.""" 112 | return ["0", "1"] 113 | 114 | 115 | def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer): 116 | """Loads a data file into a list of `InputBatch`s.""" 117 | 118 | label_map = {label: i for i, label in enumerate(label_list)} 119 | features = [] 120 | for (ex_index, example) in enumerate(examples): 121 | tokens_a = tokenizer.tokenize(example.text_a) 122 | tokens_b = None 123 | if example.text_b: 124 | tokens_b = tokenizer.tokenize(example.text_b) 125 | _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) 126 | else: 127 | if len(tokens_a) > max_seq_length - 2: 128 | tokens_a = tokens_a[:(max_seq_length - 2)] 129 | tokens = ["[CLS]"] + tokens_a + ["[SEP]"] 130 | segment_ids = [0] * len(tokens) 131 | if tokens_b: 132 | tokens += tokens_b + ["[SEP]"] 133 | segment_ids += [1] * (len(tokens_b) + 1) 134 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 135 | input_mask = [1] * len(input_ids) 136 | padding = [0] * (max_seq_length - len(input_ids)) 137 | input_ids += padding 138 | input_mask += padding 139 | segment_ids += padding 140 | 141 | assert len(input_ids) == max_seq_length 142 | assert len(input_mask) == max_seq_length 143 | assert len(segment_ids) == max_seq_length 144 | 145 | label_id = label_map[example.label] 146 | if ex_index < 5: 147 | logger.info("*** Example ***") 148 | logger.info("guid: %s" % (example.guid)) 149 | logger.info("tokens: %s" % " ".join( 150 | [str(x) for x in tokens])) 151 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 152 | logger.info("input_mask: %s" % " ".join([str(x) for x in
input_mask])) 153 | logger.info( 154 | "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) 155 | logger.info("label: %s (id = %d)" % (example.label, label_id)) 156 | 157 | features.append( 158 | InputFeatures(input_ids=input_ids, 159 | input_mask=input_mask, 160 | segment_ids=segment_ids, 161 | label_id=label_id)) 162 | return features 163 | 164 | 165 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 166 | """Truncates a sequence pair in place to the maximum length.""" 167 | 168 | # This is a simple heuristic which will always truncate the longer sequence 169 | # one token at a time. This makes more sense than truncating an equal percent 170 | # of tokens from each, since if one sequence is very short then each token 171 | # that's truncated likely contains more information than a longer sequence. 172 | while True: 173 | total_length = len(tokens_a) + len(tokens_b) 174 | if total_length <= max_length: 175 | break 176 | if len(tokens_a) > len(tokens_b): 177 | tokens_a.pop() 178 | else: 179 | tokens_b.pop() 180 | 181 | 182 | def accuracy(out, labels): 183 | outputs = np.argmax(out, axis=1) 184 | return np.sum(outputs == labels) 185 | 186 | 187 | class Predict: 188 | def __init__(self): 189 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 190 | c_bert_model = "./tmp_chinese/mrpc_output/" 191 | raw_bert_model = "./models/chinese_L-12_H-768_A-12" 192 | num_labels = 2 193 | self.tokenizer = BertTokenizer.from_pretrained(raw_bert_model) 194 | self.model = BertForSequenceClassification.from_pretrained(c_bert_model, num_labels=num_labels) 195 | self.model.to(device) 196 | 197 | def predict(self, line): 198 | processors = { 199 | # "cola": ColaProcessor, 200 | # "mnli": MnliProcessor, 201 | "mrpc": SimProcessor 202 | } 203 | 204 | num_labels_task = { 205 | "mrpc": 2, 206 | } 207 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 208 | 209 | 210 | processor = processors['mrpc']() 211 | label_list = processor.get_labels() 212 | max_seq_length = 128 213 | eval_batch_size = 8 214 | 215 | # tokenizer = BertTokenizer.from_pretrained(raw_bert_model) 216 | # model = BertForSequenceClassification.from_pretrained(c_bert_model, num_labels=num_labels) 217 | # model.to(device) 218 | 219 | test_line = line + "\t1" 220 | 221 | 222 | eval_examples = processor.get_dev_examples(test_line) 223 | eval_features = convert_examples_to_features(eval_examples, label_list, max_seq_length, self.tokenizer) 224 | 225 | all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) 226 | all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) 227 | all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) 228 | all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) 229 | eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) 230 | # Run prediction for full data 231 | eval_sampler = SequentialSampler(eval_data) 232 | eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size) 233 | 234 | self.model.eval() 235 | 236 | for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): 237 | input_ids = input_ids.to(device) 238 | input_mask = input_mask.to(device) 239 | segment_ids = segment_ids.to(device) 240 | label_ids = label_ids.to(device) 241 | 242 | with torch.no_grad(): 243 | tmp_eval_loss = self.model(input_ids, segment_ids, input_mask, label_ids) 244 | logits = 
self.model(input_ids, segment_ids, input_mask) 245 | score = F.softmax(logits, 1) 246 | maximum_probability = score.detach().cpu().numpy()[0].max() 247 | print(maximum_probability) 248 | logits = logits.detach().cpu().numpy()[0] 249 | res = np.argmax(logits) 250 | # return res 251 | 252 | id2Senti = { 253 | "0":'不同', 254 | "1":'相同', 255 | } 256 | 257 | result = { 258 | 'content': line, 259 | 'result': id2Senti[str(res)], 260 | 'probability': str(round(100*maximum_probability,2))+'%' 261 | } 262 | # return result 263 | return json.dumps(result, ensure_ascii=False) 264 | # label_ids = label_ids.to('cpu').numpy() 265 | 266 | 267 | if __name__ == "__main__": 268 | p = Predict() 269 | print(p.predict("你多大了?#你的年龄是多少?")) 270 | #input_file = './chinese_data/data_dev.txt' 271 | #sequence = read_txt(input_file) 272 | #print(time.strftime("%H:%M:%S")) 273 | #for i in range(len(sequence)): 274 | #print(p.predict(sequence[i])) 275 | #print(time.strftime("%H:%M:%S")) 276 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/pytorch_pretrained_bert/tokenization_openai.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for OpenAI GPT.""" 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import json 20 | import logging 21 | import os 22 | import re 23 | import sys 24 | from io import open 25 | 26 | from tqdm import tqdm 27 | 28 | from .file_utils import cached_path 29 | from .tokenization import BasicTokenizer 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | PRETRAINED_VOCAB_ARCHIVE_MAP = { 34 | 'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json", 35 | } 36 | PRETRAINED_MERGES_ARCHIVE_MAP = { 37 | 'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt", 38 | } 39 | PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { 40 | 'openai-gpt': 512, 41 | } 42 | VOCAB_NAME = 'vocab.json' 43 | MERGES_NAME = 'merges.txt' 44 | 45 | def get_pairs(word): 46 | """ 47 | Return set of symbol pairs in a word. 
word is represented as tuple of symbols (symbols being variable-length strings) 49 | """ 50 | pairs = set() 51 | prev_char = word[0] 52 | for char in word[1:]: 53 | pairs.add((prev_char, char)) 54 | prev_char = char 55 | return pairs 56 | 57 | def text_standardize(text): 58 | """ 59 | Fixes some issues the spacy tokenizer had on books corpus 60 | and does some whitespace standardization. 61 | """ 62 | text = text.replace('—', '-') 63 | text = text.replace('–', '-') 64 | text = text.replace('―', '-') 65 | text = text.replace('…', '...') 66 | text = text.replace('´', "'") 67 | text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text) 68 | text = re.sub(r'\s*\n\s*', ' \n ', text) 69 | text = re.sub(r'[^\S\n]+', ' ', text) 70 | return text.strip() 71 | 72 | class OpenAIGPTTokenizer(object): 73 | """ 74 | BPE tokenizer. Peculiarities: 75 | - lower case all inputs 76 | - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, falling back to BERT's BasicTokenizer if not. 77 | - argument special_tokens and function set_special_tokens: 78 | can be used to add additional symbols (ex: "__classify__") to a vocabulary. 79 | """ 80 | @classmethod 81 | def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): 82 | """ 83 | Instantiate an OpenAIGPTTokenizer from a pre-trained model file. 84 | Download and cache the pre-trained model file if needed. 85 | """ 86 | if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: 87 | vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] 88 | merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path] 89 | else: 90 | vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) 91 | merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME) 92 | # redirect to the cache, if necessary 93 | try: 94 | resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) 95 | resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir) 96 | except EnvironmentError: 97 | logger.error( 98 | "Model name '{}' was not found in model name list ({}). " 99 | "We assumed '{}' was a path or url but couldn't find files {} and {} " 100 | "at this path or url.".format( 101 | pretrained_model_name_or_path, 102 | ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), 103 | pretrained_model_name_or_path, 104 | vocab_file, merges_file)) 105 | return None 106 | if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file: 107 | logger.info("loading vocabulary file {}".format(vocab_file)) 108 | logger.info("loading merges file {}".format(merges_file)) 109 | else: 110 | logger.info("loading vocabulary file {} from cache at {}".format( 111 | vocab_file, resolved_vocab_file)) 112 | logger.info("loading merges file {} from cache at {}".format( 113 | merges_file, resolved_merges_file)) 114 | if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: 115 | # if we're using a pretrained model, ensure the tokenizer won't index sequences longer 116 | # than the number of positional embeddings 117 | max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] 118 | kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) 119 | # Instantiate tokenizer.
120 | tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs) 121 | return tokenizer 122 | 123 | def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None): 124 | try: 125 | import ftfy 126 | import spacy 127 | self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat']) 128 | self.fix_text = ftfy.fix_text 129 | except ImportError: 130 | logger.warning("ftfy or spacy is not installed; using BERT BasicTokenizer instead of SpaCy & ftfy.") 131 | self.nlp = BasicTokenizer(do_lower_case=True, 132 | never_split=special_tokens if special_tokens is not None else []) 133 | self.fix_text = None 134 | 135 | self.max_len = max_len if max_len is not None else int(1e12) 136 | self.encoder = json.load(open(vocab_file, encoding="utf-8")) 137 | self.decoder = {v:k for k,v in self.encoder.items()} 138 | merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] 139 | merges = [tuple(merge.split()) for merge in merges] 140 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 141 | self.cache = {} 142 | self.set_special_tokens(special_tokens) 143 | 144 | def __len__(self): 145 | return len(self.encoder) + len(self.special_tokens) 146 | 147 | def set_special_tokens(self, special_tokens): 148 | """ Add a list of additional tokens to the encoder. 149 | The additional tokens are indexed starting from the last index of the 150 | current vocabulary in the order of the `special_tokens` list. 151 | """ 152 | if not special_tokens: 153 | self.special_tokens = {} 154 | self.special_tokens_decoder = {} 155 | return 156 | self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens)) 157 | self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()} 158 | if self.fix_text is None: 159 | # Using BERT's BasicTokenizer: we can update the tokenizer 160 | self.nlp.never_split = special_tokens 161 | logger.info("Special tokens {}".format(self.special_tokens)) 162 | 163 | def bpe(self, token): 164 | word = tuple(token[:-1]) + (token[-1] + '</w>',)  # '</w>' marks end-of-word in the OpenAI GPT BPE vocab 165 | if token in self.cache: 166 | return self.cache[token] 167 | pairs = get_pairs(word) 168 | 169 | if not pairs: 170 | return token+'</w>' 171 | 172 | while True: 173 | bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) 174 | if bigram not in self.bpe_ranks: 175 | break 176 | first, second = bigram 177 | new_word = [] 178 | i = 0 179 | while i < len(word): 180 | try: 181 | j = word.index(first, i) 182 | new_word.extend(word[i:j]) 183 | i = j 184 | except ValueError: 185 | new_word.extend(word[i:]) 186 | break 187 | 188 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 189 | new_word.append(first+second) 190 | i += 2 191 | else: 192 | new_word.append(word[i]) 193 | i += 1 194 | new_word = tuple(new_word) 195 | word = new_word 196 | if len(word) == 1: 197 | break 198 | else: 199 | pairs = get_pairs(word) 200 | word = ' '.join(word) 201 | if word == '\n  </w>': 202 | word = '\n</w>' 203 | self.cache[token] = word 204 | return word 205 | 206 | def tokenize(self, text): 207 | """ Tokenize a string. """ 208 | split_tokens = [] 209 | if self.fix_text is None: 210 | # Using BERT's BasicTokenizer 211 | text = self.nlp.tokenize(text) 212 | for token in text: 213 | split_tokens.extend([t for t in self.bpe(token).split(' ')]) 214 | else: 215 | # Using SpaCy & ftfy (original tokenization process of OpenAI GPT) 216 | text = self.nlp(text_standardize(self.fix_text(text))) 217 | for token in text: 218 | split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')]) 219 | return split_tokens 220 | 221 | def convert_tokens_to_ids(self, tokens): 222 | """ Converts a sequence of tokens into ids using the vocab. """ 223 | ids = [] 224 | if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)): 225 | if tokens in self.special_tokens: 226 | return self.special_tokens[tokens] 227 | else: 228 | return self.encoder.get(tokens, 0) 229 | for token in tokens: 230 | if token in self.special_tokens: 231 | ids.append(self.special_tokens[token]) 232 | else: 233 | ids.append(self.encoder.get(token, 0)) 234 | if len(ids) > self.max_len: 235 | raise ValueError( 236 | "Token indices sequence length is longer than the specified maximum " 237 | " sequence length for this OpenAI GPT model ({} > {}). Running this" 238 | " sequence through the model will result in indexing errors".format(len(ids), self.max_len) 239 | ) 240 | return ids 241 | 242 | def convert_ids_to_tokens(self, ids, skip_special_tokens=False): 243 | """Converts a sequence of ids in BPE tokens using the vocab.""" 244 | tokens = [] 245 | for i in ids: 246 | if i in self.special_tokens_decoder: 247 | if not skip_special_tokens: 248 | tokens.append(self.special_tokens_decoder[i]) 249 | else: 250 | tokens.append(self.decoder[i]) 251 | return tokens 252 | 253 | def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=False): 254 | """Converts a sequence of ids in a string.""" 255 | tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens) 256 | out_string = ''.join(tokens).replace('</w>', ' ').strip() 257 | if clean_up_tokenization_spaces: 258 | out_string = out_string.replace('<unk>', '') 259 | out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',').replace(" ' ", "'" 260 | ).replace(" n't", "n't").replace(" 'm", "'m").replace(" 're", "'re").replace(" do not", " don't" 261 | ).replace(" 's", "'s").replace(" t ", "'t ").replace(" s ", "'s ").replace(" m ", "'m " 262 | ).replace(" 've", "'ve") 263 | return out_string 264 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/examples/extract_features.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | """Extract pre-computed feature vectors from a PyTorch BERT model."""
16 | 
17 | # Python 3.x compatibility features
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 | 
22 | import argparse
23 | import collections  # supplementary container datatypes beyond dict, tuple, etc.
24 | import logging  # standard logging module
25 | import json
26 | import re
27 | 
28 | import torch
29 | from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
30 | from torch.utils.data.distributed import DistributedSampler
31 | 
32 | from pytorch_pretrained_bert.tokenization import BertTokenizer
33 | from pytorch_pretrained_bert.modeling import BertModel
34 | 
35 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
36 |                     datefmt='%m/%d/%Y %H:%M:%S',
37 |                     level=logging.INFO)
38 | logger = logging.getLogger(__name__)
39 | 
40 | 
41 | class InputExample(object):
42 | 
43 |     def __init__(self, unique_id, text_a, text_b):
44 |         self.unique_id = unique_id
45 |         self.text_a = text_a
46 |         self.text_b = text_b
47 | 
48 | 
49 | class InputFeatures(object):
50 |     """A single set of features of data."""
51 | 
52 |     def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
53 |         self.unique_id = unique_id
54 |         self.tokens = tokens
55 |         self.input_ids = input_ids
56 |         self.input_mask = input_mask
57 |         self.input_type_ids = input_type_ids
58 | 
59 | 
60 | def convert_examples_to_features(examples, seq_length, tokenizer):
61 |     """Loads a data file into a list of `InputBatch`s."""
62 | 
63 |     features = []
64 |     for (ex_index, example) in enumerate(examples):
65 |         tokens_a = tokenizer.tokenize(example.text_a)
66 | 
67 |         tokens_b = None
68 |         if example.text_b:
69 |             tokens_b = tokenizer.tokenize(example.text_b)
70 | 
71 |         if tokens_b:
72 |             # Modifies `tokens_a` and `tokens_b` in place so that the total
73 |             # length is less than the specified length.
74 |             # Account for [CLS], [SEP], [SEP] with "- 3"
75 |             _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
76 |         else:
77 |             # Account for [CLS] and [SEP] with "- 2"
78 |             if len(tokens_a) > seq_length - 2:
79 |                 tokens_a = tokens_a[0:(seq_length - 2)]
80 | 
81 |         # The convention in BERT is:
82 |         # (a) For sequence pairs:
83 |         #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
84 |         #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
85 |         # (b) For single sequences:
86 |         #  tokens:   [CLS] the dog is hairy . [SEP]
87 |         #  type_ids: 0     0   0   0  0     0 0
88 |         #
89 |         # Where "type_ids" are used to indicate whether this is the first
90 |         # sequence or the second sequence. The embedding vectors for `type=0` and
91 |         # `type=1` were learned during pre-training and are added to the wordpiece
92 |         # embedding vector (and position vector). This is not *strictly* necessary
93 |         # since the [SEP] token unambiguously separates the sequences, but it makes
94 |         # it easier for the model to learn the concept of sequences.
95 |         #
96 |         # For classification tasks, the first vector (corresponding to [CLS]) is
97 |         # used as the "sentence vector". Note that this only makes sense because
98 |         # the entire model is fine-tuned.
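        # Illustrative trace (an added note, not from the original code): for
        # the toy pair text_a="花呗", text_b="借呗" with seq_length=10, the code
        # below builds:
        #   tokens:          [CLS] 花 呗 [SEP] 借 呗 [SEP]
        #   input_type_ids:    0   0  0    0   1  1    1   0  0  0
        #   input_mask:        1   1  1    1   1  1    1   0  0  0
        # Only input_ids, input_mask and input_type_ids are zero-padded up to
        # seq_length; the `tokens` list keeps its unpadded length.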
99 |         tokens = []
100 |         input_type_ids = []
101 |         tokens.append("[CLS]")
102 |         input_type_ids.append(0)
103 |         for token in tokens_a:
104 |             tokens.append(token)
105 |             input_type_ids.append(0)
106 |         tokens.append("[SEP]")
107 |         input_type_ids.append(0)
108 | 
109 |         if tokens_b:
110 |             for token in tokens_b:
111 |                 tokens.append(token)
112 |                 input_type_ids.append(1)
113 |             tokens.append("[SEP]")
114 |             input_type_ids.append(1)
115 | 
116 |         input_ids = tokenizer.convert_tokens_to_ids(tokens)
117 | 
118 |         # The mask has 1 for real tokens and 0 for padding tokens. Only real
119 |         # tokens are attended to.
120 |         input_mask = [1] * len(input_ids)
121 | 
122 |         # Zero-pad up to the sequence length.
123 |         while len(input_ids) < seq_length:
124 |             input_ids.append(0)
125 |             input_mask.append(0)
126 |             input_type_ids.append(0)
127 | 
128 |         assert len(input_ids) == seq_length
129 |         assert len(input_mask) == seq_length
130 |         assert len(input_type_ids) == seq_length
131 | 
132 |         if ex_index < 5:
133 |             logger.info("*** Example ***")
134 |             logger.info("unique_id: %s" % (example.unique_id))
135 |             logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
136 |             logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
137 |             logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
138 |             logger.info(
139 |                 "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
140 | 
141 |         features.append(
142 |             InputFeatures(
143 |                 unique_id=example.unique_id,
144 |                 tokens=tokens,
145 |                 input_ids=input_ids,
146 |                 input_mask=input_mask,
147 |                 input_type_ids=input_type_ids))
148 |     return features
149 | 
150 | 
151 | def _truncate_seq_pair(tokens_a, tokens_b, max_length):
152 |     """Truncates a sequence pair in place to the maximum length."""
153 | 
154 |     # This is a simple heuristic which will always truncate the longer sequence
155 |     # one token at a time. This makes more sense than truncating an equal percent
156 |     # of tokens from each, since if one sequence is very short then each token
157 |     # that's truncated likely contains more information than a longer sequence.
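    # Worked example (added for illustration): with max_length=6 and input
    # lengths len(tokens_a)=5, len(tokens_b)=3, the loop below pops from the
    # longer list twice (5+3 -> 4+3 -> 3+3) and stops at 3+3, rather than
    # trimming both sequences by an equal fraction.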
158 |     while True:
159 |         total_length = len(tokens_a) + len(tokens_b)
160 |         if total_length <= max_length:
161 |             break
162 |         if len(tokens_a) > len(tokens_b):
163 |             tokens_a.pop()
164 |         else:
165 |             tokens_b.pop()
166 | 
167 | 
168 | def read_examples(input_file):
169 |     """Read a list of `InputExample`s from an input file."""
170 |     examples = []
171 |     unique_id = 0
172 |     with open(input_file, "r", encoding='utf-8') as reader:
173 |         while True:
174 |             line = reader.readline()
175 |             if not line:
176 |                 break
177 |             line = line.strip()
178 |             text_a = None
179 |             text_b = None
180 |             m = re.match(r"^(.*) \|\|\| (.*)$", line)
181 |             if m is None:
182 |                 text_a = line
183 |             else:
184 |                 text_a = m.group(1)
185 |                 text_b = m.group(2)
186 |             examples.append(
187 |                 InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
188 |             unique_id += 1
189 |     return examples
190 | 
191 | 
192 | def main():
193 |     parser = argparse.ArgumentParser()
194 | 
195 |     ## Required parameters
196 |     parser.add_argument("--input_file", default=None, type=str, required=True)
197 |     parser.add_argument("--output_file", default=None, type=str, required=True)
198 |     parser.add_argument("--bert_model", default=None, type=str, required=True,
199 |                         help="Bert pre-trained model selected in the list: bert-base-uncased, "
200 |                              "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
201 | 
202 |     ## Other parameters
203 |     parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
204 |     parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
205 |     parser.add_argument("--max_seq_length", default=128, type=int,
206 |                         help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
207 |                              "than this will be truncated, and sequences shorter than this will be padded.")
208 |     parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
209 |     parser.add_argument("--local_rank",
210 |                         type=int,
211 |                         default=-1,
212 |                         help="local_rank for distributed training on gpus")
213 |     parser.add_argument("--no_cuda",
214 |                         action='store_true',
215 |                         help="Whether not to use CUDA when available")
216 | 
217 |     args = parser.parse_args()
218 | 
219 |     if args.local_rank == -1 or args.no_cuda:
220 |         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
221 |         n_gpu = torch.cuda.device_count()
222 |     else:
223 |         device = torch.device("cuda", args.local_rank)
224 |         n_gpu = 1
225 |         # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
226 |         torch.distributed.init_process_group(backend='nccl')
227 |     logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(args.local_rank != -1)))
228 | 
229 |     layer_indexes = [int(x) for x in args.layers.split(",")]
230 | 
231 |     tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
232 | 
233 |     examples = read_examples(args.input_file)
234 | 
235 |     features = convert_examples_to_features(
236 |         examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer)
237 | 
238 |     unique_id_to_feature = {}
239 |     for feature in features:
240 |         unique_id_to_feature[feature.unique_id] = feature
241 | 
242 |     model = BertModel.from_pretrained(args.bert_model)
243 |     model.to(device)
244 | 
245 |     if args.local_rank != -1:
246 |         model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
247 |                                                           output_device=args.local_rank)
248 |     elif n_gpu > 1:
249 |         model = torch.nn.DataParallel(model)
250 | 
251 |     all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
252 |     all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
253 |     all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
254 | 
255 |     eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
256 |     if args.local_rank == -1:
257 |         eval_sampler = SequentialSampler(eval_data)
258 |     else:
259 |         eval_sampler = DistributedSampler(eval_data)
260 |     eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
261 | 
262 |     model.eval()
263 |     with open(args.output_file, "w", encoding='utf-8') as writer:
264 |         for input_ids, input_mask, example_indices in eval_dataloader:
265 |             input_ids = input_ids.to(device)
266 |             input_mask = input_mask.to(device)
267 | 
268 |             all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)
269 |             # all_encoder_layers is a list with one hidden-state tensor per BERT layer
270 | 
271 |             for b, example_index in enumerate(example_indices):
272 |                 feature = features[example_index.item()]
273 |                 unique_id = int(feature.unique_id)
274 |                 # feature = unique_id_to_feature[unique_id]
275 |                 output_json = collections.OrderedDict()
276 |                 output_json["linex_index"] = unique_id
277 |                 all_out_features = []
278 |                 for (i, token) in enumerate(feature.tokens):
279 |                     all_layers = []
280 |                     for (j, layer_index) in enumerate(layer_indexes):
281 |                         layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
282 |                         layer_output = layer_output[b]
283 |                         layers = collections.OrderedDict()
284 |                         layers["index"] = layer_index
285 |                         layers["values"] = [
286 |                             round(x.item(), 6) for x in layer_output[i]
287 |                         ]
288 |                         all_layers.append(layers)
289 |                     out_features = collections.OrderedDict()
290 |                     out_features["token"] = token
291 |                     out_features["layers"] = all_layers
292 |                     all_out_features.append(out_features)
293 |                 output_json["features"] = all_out_features
294 |                 writer.write(json.dumps(output_json) + "\n")
295 | 
296 | 
297 | if __name__ == "__main__":
298 |     main()
299 | 
--------------------------------------------------------------------------------
/sentence_similarity_Bert/tests/modeling_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
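# A note added for orientation: the tests below build tiny randomly initialized
# BERT models (hidden_size=32, 5 layers, 4 attention heads) and assert only on
# output shapes, so they run in seconds on CPU. A typical invocation (the
# module path is a guess; adjust it to your checkout layout) would be:
#
#   python -m unittest tests.modeling_test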
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import json 21 | import random 22 | 23 | import torch 24 | 25 | from pytorch_pretrained_bert import (BertConfig, BertModel, BertForMaskedLM, 26 | BertForNextSentencePrediction, BertForPreTraining, 27 | BertForQuestionAnswering, BertForSequenceClassification, 28 | BertForTokenClassification) 29 | 30 | 31 | class BertModelTest(unittest.TestCase): 32 | class BertModelTester(object): 33 | 34 | def __init__(self, 35 | parent, 36 | batch_size=13, 37 | seq_length=7, 38 | is_training=True, 39 | use_input_mask=True, 40 | use_token_type_ids=True, 41 | use_labels=True, 42 | vocab_size=99, 43 | hidden_size=32, 44 | num_hidden_layers=5, 45 | num_attention_heads=4, 46 | intermediate_size=37, 47 | hidden_act="gelu", 48 | hidden_dropout_prob=0.1, 49 | attention_probs_dropout_prob=0.1, 50 | max_position_embeddings=512, 51 | type_vocab_size=16, 52 | type_sequence_label_size=2, 53 | initializer_range=0.02, 54 | num_labels=3, 55 | scope=None): 56 | self.parent = parent 57 | self.batch_size = batch_size 58 | self.seq_length = seq_length 59 | self.is_training = is_training 60 | self.use_input_mask = use_input_mask 61 | self.use_token_type_ids = use_token_type_ids 62 | self.use_labels = use_labels 63 | self.vocab_size = vocab_size 64 | self.hidden_size = hidden_size 65 | self.num_hidden_layers = num_hidden_layers 66 | self.num_attention_heads = num_attention_heads 67 | self.intermediate_size = intermediate_size 68 | self.hidden_act = hidden_act 69 | self.hidden_dropout_prob = hidden_dropout_prob 70 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 71 | self.max_position_embeddings = max_position_embeddings 72 | self.type_vocab_size = type_vocab_size 73 | self.type_sequence_label_size = type_sequence_label_size 74 | self.initializer_range = initializer_range 75 | self.num_labels = num_labels 76 | self.scope = scope 77 | 78 | def prepare_config_and_inputs(self): 79 | input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size) 80 | 81 | input_mask = None 82 | if self.use_input_mask: 83 | input_mask = BertModelTest.ids_tensor([self.batch_size, self.seq_length], vocab_size=2) 84 | 85 | token_type_ids = None 86 | if self.use_token_type_ids: 87 | token_type_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) 88 | 89 | sequence_labels = None 90 | token_labels = None 91 | if self.use_labels: 92 | sequence_labels = BertModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size) 93 | token_labels = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.num_labels) 94 | 95 | config = BertConfig( 96 | vocab_size_or_config_json_file=self.vocab_size, 97 | hidden_size=self.hidden_size, 98 | num_hidden_layers=self.num_hidden_layers, 99 | num_attention_heads=self.num_attention_heads, 100 | intermediate_size=self.intermediate_size, 101 | hidden_act=self.hidden_act, 102 | hidden_dropout_prob=self.hidden_dropout_prob, 103 | attention_probs_dropout_prob=self.attention_probs_dropout_prob, 104 | max_position_embeddings=self.max_position_embeddings, 105 | type_vocab_size=self.type_vocab_size, 106 | initializer_range=self.initializer_range) 107 | 108 | return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels 109 | 110 | def check_loss_output(self, result): 111 | self.parent.assertListEqual( 112 | list(result["loss"].size()), 113 | []) 114 | 115 | def 
create_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): 116 | model = BertModel(config=config) 117 | model.eval() 118 | all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) 119 | outputs = { 120 | "sequence_output": all_encoder_layers[-1], 121 | "pooled_output": pooled_output, 122 | "all_encoder_layers": all_encoder_layers, 123 | } 124 | return outputs 125 | 126 | def check_bert_model_output(self, result): 127 | self.parent.assertListEqual( 128 | [size for layer in result["all_encoder_layers"] for size in layer.size()], 129 | [self.batch_size, self.seq_length, self.hidden_size] * self.num_hidden_layers) 130 | self.parent.assertListEqual( 131 | list(result["sequence_output"].size()), 132 | [self.batch_size, self.seq_length, self.hidden_size]) 133 | self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size]) 134 | 135 | 136 | def create_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): 137 | model = BertForMaskedLM(config=config) 138 | model.eval() 139 | loss = model(input_ids, token_type_ids, input_mask, token_labels) 140 | prediction_scores = model(input_ids, token_type_ids, input_mask) 141 | outputs = { 142 | "loss": loss, 143 | "prediction_scores": prediction_scores, 144 | } 145 | return outputs 146 | 147 | def check_bert_for_masked_lm_output(self, result): 148 | self.parent.assertListEqual( 149 | list(result["prediction_scores"].size()), 150 | [self.batch_size, self.seq_length, self.vocab_size]) 151 | 152 | def create_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): 153 | model = BertForNextSentencePrediction(config=config) 154 | model.eval() 155 | loss = model(input_ids, token_type_ids, input_mask, sequence_labels) 156 | seq_relationship_score = model(input_ids, token_type_ids, input_mask) 157 | outputs = { 158 | "loss": loss, 159 | "seq_relationship_score": seq_relationship_score, 160 | } 161 | return outputs 162 | 163 | def check_bert_for_next_sequence_prediction_output(self, result): 164 | self.parent.assertListEqual( 165 | list(result["seq_relationship_score"].size()), 166 | [self.batch_size, 2]) 167 | 168 | 169 | def create_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): 170 | model = BertForPreTraining(config=config) 171 | model.eval() 172 | loss = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels) 173 | prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask) 174 | outputs = { 175 | "loss": loss, 176 | "prediction_scores": prediction_scores, 177 | "seq_relationship_score": seq_relationship_score, 178 | } 179 | return outputs 180 | 181 | def check_bert_for_pretraining_output(self, result): 182 | self.parent.assertListEqual( 183 | list(result["prediction_scores"].size()), 184 | [self.batch_size, self.seq_length, self.vocab_size]) 185 | self.parent.assertListEqual( 186 | list(result["seq_relationship_score"].size()), 187 | [self.batch_size, 2]) 188 | 189 | 190 | def create_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): 191 | model = BertForQuestionAnswering(config=config) 192 | model.eval() 193 | loss = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels) 194 | start_logits, end_logits = model(input_ids, token_type_ids, input_mask) 195 | 
outputs = { 196 | "loss": loss, 197 | "start_logits": start_logits, 198 | "end_logits": end_logits, 199 | } 200 | return outputs 201 | 202 | def check_bert_for_question_answering_output(self, result): 203 | self.parent.assertListEqual( 204 | list(result["start_logits"].size()), 205 | [self.batch_size, self.seq_length]) 206 | self.parent.assertListEqual( 207 | list(result["end_logits"].size()), 208 | [self.batch_size, self.seq_length]) 209 | 210 | 211 | def create_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): 212 | model = BertForSequenceClassification(config=config, num_labels=self.num_labels) 213 | model.eval() 214 | loss = model(input_ids, token_type_ids, input_mask, sequence_labels) 215 | logits = model(input_ids, token_type_ids, input_mask) 216 | outputs = { 217 | "loss": loss, 218 | "logits": logits, 219 | } 220 | return outputs 221 | 222 | def check_bert_for_sequence_classification_output(self, result): 223 | self.parent.assertListEqual( 224 | list(result["logits"].size()), 225 | [self.batch_size, self.num_labels]) 226 | 227 | 228 | def create_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): 229 | model = BertForTokenClassification(config=config, num_labels=self.num_labels) 230 | model.eval() 231 | loss = model(input_ids, token_type_ids, input_mask, token_labels) 232 | logits = model(input_ids, token_type_ids, input_mask) 233 | outputs = { 234 | "loss": loss, 235 | "logits": logits, 236 | } 237 | return outputs 238 | 239 | def check_bert_for_token_classification_output(self, result): 240 | self.parent.assertListEqual( 241 | list(result["logits"].size()), 242 | [self.batch_size, self.seq_length, self.num_labels]) 243 | 244 | 245 | def test_default(self): 246 | self.run_tester(BertModelTest.BertModelTester(self)) 247 | 248 | def test_config_to_json_string(self): 249 | config = BertConfig(vocab_size_or_config_json_file=99, hidden_size=37) 250 | obj = json.loads(config.to_json_string()) 251 | self.assertEqual(obj["vocab_size"], 99) 252 | self.assertEqual(obj["hidden_size"], 37) 253 | 254 | def run_tester(self, tester): 255 | config_and_inputs = tester.prepare_config_and_inputs() 256 | output_result = tester.create_bert_model(*config_and_inputs) 257 | tester.check_bert_model_output(output_result) 258 | 259 | output_result = tester.create_bert_for_masked_lm(*config_and_inputs) 260 | tester.check_bert_for_masked_lm_output(output_result) 261 | tester.check_loss_output(output_result) 262 | 263 | output_result = tester.create_bert_for_next_sequence_prediction(*config_and_inputs) 264 | tester.check_bert_for_next_sequence_prediction_output(output_result) 265 | tester.check_loss_output(output_result) 266 | 267 | output_result = tester.create_bert_for_pretraining(*config_and_inputs) 268 | tester.check_bert_for_pretraining_output(output_result) 269 | tester.check_loss_output(output_result) 270 | 271 | output_result = tester.create_bert_for_question_answering(*config_and_inputs) 272 | tester.check_bert_for_question_answering_output(output_result) 273 | tester.check_loss_output(output_result) 274 | 275 | output_result = tester.create_bert_for_sequence_classification(*config_and_inputs) 276 | tester.check_bert_for_sequence_classification_output(output_result) 277 | tester.check_loss_output(output_result) 278 | 279 | output_result = tester.create_bert_for_token_classification(*config_and_inputs) 280 | tester.check_bert_for_token_classification_output(output_result) 
281 |         tester.check_loss_output(output_result)
282 | 
283 |     @classmethod
284 |     def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
285 |         """Creates a random integer tensor of the given shape with values in [0, vocab_size)."""
286 |         if rng is None:
287 |             rng = random.Random()
288 | 
289 |         total_dims = 1
290 |         for dim in shape:
291 |             total_dims *= dim
292 | 
293 |         values = []
294 |         for _ in range(total_dims):
295 |             values.append(rng.randint(0, vocab_size - 1))
296 | 
297 |         return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
298 | 
299 | 
300 | if __name__ == "__main__":
301 |     unittest.main()
302 | 
--------------------------------------------------------------------------------
/sentence_similarity_Bert/pytorch_pretrained_bert/tokenization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes."""
16 | 
17 | from __future__ import absolute_import, division, print_function, unicode_literals
18 | 
19 | import collections
20 | import logging
21 | import os
22 | import unicodedata
23 | from io import open
24 | 
25 | from .file_utils import cached_path
26 | 
27 | logger = logging.getLogger(__name__)
28 | 
29 | PRETRAINED_VOCAB_ARCHIVE_MAP = {
30 |     'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
31 |     'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
32 |     'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
33 |     'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
34 |     'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
35 |     'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
36 |     'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
37 | }
38 | PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
39 |     'bert-base-uncased': 512,
40 |     'bert-large-uncased': 512,
41 |     'bert-base-cased': 512,
42 |     'bert-large-cased': 512,
43 |     'bert-base-multilingual-uncased': 512,
44 |     'bert-base-multilingual-cased': 512,
45 |     'bert-base-chinese': 512,
46 | }
47 | VOCAB_NAME = 'vocab.txt'
48 | 
49 | 
50 | def load_vocab(vocab_file):
51 |     """Loads a vocabulary file into a dictionary."""
52 |     vocab = collections.OrderedDict()
53 |     index = 0
54 |     with open(vocab_file, "r", encoding="utf-8") as reader:
55 |         while True:
56 |             token = reader.readline()
57 |             if not token:
58 |                 break
59 |             token = token.strip()
60 |             vocab[token] = index
61 |             index += 1
62 |     return vocab
63 | 
64 | 
65 | def whitespace_tokenize(text):
66 |     """Runs basic whitespace cleaning and splitting on a piece of
text.""" 67 | text = text.strip() 68 | if not text: 69 | return [] 70 | tokens = text.split() 71 | return tokens 72 | 73 | 74 | class BertTokenizer(object): 75 | """Runs end-to-end tokenization: punctuation splitting + wordpiece""" 76 | 77 | def __init__(self, vocab_file, do_lower_case=True, max_len=None, 78 | never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): 79 | if not os.path.isfile(vocab_file): 80 | raise ValueError( 81 | "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " 82 | "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) 83 | self.vocab = load_vocab(vocab_file) 84 | self.ids_to_tokens = collections.OrderedDict( 85 | [(ids, tok) for tok, ids in self.vocab.items()]) 86 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, 87 | never_split=never_split) 88 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 89 | self.max_len = max_len if max_len is not None else int(1e12) 90 | 91 | def tokenize(self, text): 92 | split_tokens = [] 93 | for token in self.basic_tokenizer.tokenize(text): 94 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 95 | split_tokens.append(sub_token) 96 | return split_tokens 97 | 98 | def convert_tokens_to_ids(self, tokens): 99 | """Converts a sequence of tokens into ids using the vocab.""" 100 | ids = [] 101 | for token in tokens: 102 | ids.append(self.vocab[token]) 103 | if len(ids) > self.max_len: 104 | raise ValueError( 105 | "Token indices sequence length is longer than the specified maximum " 106 | " sequence length for this BERT model ({} > {}). Running this" 107 | " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) 108 | ) 109 | return ids 110 | 111 | def convert_ids_to_tokens(self, ids): 112 | """Converts a sequence of ids in wordpiece tokens using the vocab.""" 113 | tokens = [] 114 | for i in ids: 115 | tokens.append(self.ids_to_tokens[i]) 116 | return tokens 117 | 118 | @classmethod 119 | def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): 120 | """ 121 | Instantiate a PreTrainedBertModel from a pre-trained model file. 122 | Download and cache the pre-trained model file if needed. 123 | """ 124 | if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: 125 | vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] 126 | else: 127 | vocab_file = pretrained_model_name_or_path 128 | if os.path.isdir(vocab_file): 129 | vocab_file = os.path.join(vocab_file, VOCAB_NAME) 130 | # redirect to the cache, if necessary 131 | try: 132 | resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) 133 | except EnvironmentError: 134 | logger.error( 135 | "Model name '{}' was not found in model name list ({}). 
" 136 | "We assumed '{}' was a path or url but couldn't find any file " 137 | "associated to this path or url.".format( 138 | pretrained_model_name_or_path, 139 | ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), 140 | vocab_file)) 141 | return None 142 | if resolved_vocab_file == vocab_file: 143 | logger.info("loading vocabulary file {}".format(vocab_file)) 144 | else: 145 | logger.info("loading vocabulary file {} from cache at {}".format( 146 | vocab_file, resolved_vocab_file)) 147 | if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: 148 | # if we're using a pretrained model, ensure the tokenizer wont index sequences longer 149 | # than the number of positional embeddings 150 | max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] 151 | kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) 152 | # Instantiate tokenizer. 153 | tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) 154 | return tokenizer 155 | 156 | 157 | class BasicTokenizer(object): 158 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 159 | 160 | def __init__(self, 161 | do_lower_case=True, 162 | never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): 163 | """Constructs a BasicTokenizer. 164 | 165 | Args: 166 | do_lower_case: Whether to lower case the input. 167 | """ 168 | self.do_lower_case = do_lower_case 169 | self.never_split = never_split 170 | 171 | def tokenize(self, text): 172 | """Tokenizes a piece of text.""" 173 | text = self._clean_text(text) 174 | # This was added on November 1st, 2018 for the multilingual and Chinese 175 | # models. This is also applied to the English models now, but it doesn't 176 | # matter since the English models were not trained on any Chinese data 177 | # and generally don't have any Chinese data in them (there are Chinese 178 | # characters in the vocabulary because Wikipedia does have some Chinese 179 | # words in the English Wikipedia.). 
180 | text = self._tokenize_chinese_chars(text) 181 | orig_tokens = whitespace_tokenize(text) 182 | split_tokens = [] 183 | for token in orig_tokens: 184 | if self.do_lower_case and token not in self.never_split: 185 | token = token.lower() 186 | token = self._run_strip_accents(token) 187 | split_tokens.extend(self._run_split_on_punc(token)) 188 | 189 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 190 | return output_tokens 191 | 192 | def _run_strip_accents(self, text): 193 | """Strips accents from a piece of text.""" 194 | text = unicodedata.normalize("NFD", text) 195 | output = [] 196 | for char in text: 197 | cat = unicodedata.category(char) 198 | if cat == "Mn": 199 | continue 200 | output.append(char) 201 | return "".join(output) 202 | 203 | def _run_split_on_punc(self, text): 204 | """Splits punctuation on a piece of text.""" 205 | if text in self.never_split: 206 | return [text] 207 | chars = list(text) 208 | i = 0 209 | start_new_word = True 210 | output = [] 211 | while i < len(chars): 212 | char = chars[i] 213 | if _is_punctuation(char): 214 | output.append([char]) 215 | start_new_word = True 216 | else: 217 | if start_new_word: 218 | output.append([]) 219 | start_new_word = False 220 | output[-1].append(char) 221 | i += 1 222 | 223 | return ["".join(x) for x in output] 224 | 225 | def _tokenize_chinese_chars(self, text): 226 | """Adds whitespace around any CJK character.""" 227 | output = [] 228 | for char in text: 229 | cp = ord(char) 230 | if self._is_chinese_char(cp): 231 | output.append(" ") 232 | output.append(char) 233 | output.append(" ") 234 | else: 235 | output.append(char) 236 | return "".join(output) 237 | 238 | def _is_chinese_char(self, cp): 239 | """Checks whether CP is the codepoint of a CJK character.""" 240 | # This defines a "chinese character" as anything in the CJK Unicode block: 241 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 242 | # 243 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 244 | # despite its name. The modern Korean Hangul alphabet is a different block, 245 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 246 | # space-separated words, so they are not treated specially and handled 247 | # like the all of the other languages. 248 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 249 | (cp >= 0x3400 and cp <= 0x4DBF) or # 250 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 251 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 252 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 253 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 254 | (cp >= 0xF900 and cp <= 0xFAFF) or # 255 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 256 | return True 257 | 258 | return False 259 | 260 | def _clean_text(self, text): 261 | """Performs invalid character removal and whitespace cleanup on text.""" 262 | output = [] 263 | for char in text: 264 | cp = ord(char) 265 | if cp == 0 or cp == 0xfffd or _is_control(char): 266 | continue 267 | if _is_whitespace(char): 268 | output.append(" ") 269 | else: 270 | output.append(char) 271 | return "".join(output) 272 | 273 | 274 | class WordpieceTokenizer(object): 275 | """Runs WordPiece tokenization.""" 276 | 277 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): 278 | self.vocab = vocab 279 | self.unk_token = unk_token 280 | self.max_input_chars_per_word = max_input_chars_per_word 281 | 282 | def tokenize(self, text): 283 | """Tokenizes a piece of text into its word pieces. 
284 | 
285 |         This uses a greedy longest-match-first algorithm to perform tokenization
286 |         using the given vocabulary.
287 | 
288 |         For example:
289 |           input = "unaffable"
290 |           output = ["un", "##aff", "##able"]
291 | 
292 |         Args:
293 |           text: A single token or whitespace separated tokens. This should have
294 |             already been passed through `BasicTokenizer`.
295 | 
296 |         Returns:
297 |           A list of wordpiece tokens.
298 |         """
299 | 
300 |         output_tokens = []
301 |         for token in whitespace_tokenize(text):
302 |             chars = list(token)
303 |             if len(chars) > self.max_input_chars_per_word:
304 |                 output_tokens.append(self.unk_token)
305 |                 continue
306 | 
307 |             is_bad = False
308 |             start = 0
309 |             sub_tokens = []
310 |             while start < len(chars):
311 |                 end = len(chars)
312 |                 cur_substr = None
313 |                 while start < end:
314 |                     substr = "".join(chars[start:end])
315 |                     if start > 0:
316 |                         substr = "##" + substr
317 |                     if substr in self.vocab:
318 |                         cur_substr = substr
319 |                         break
320 |                     end -= 1
321 |                 if cur_substr is None:
322 |                     is_bad = True
323 |                     break
324 |                 sub_tokens.append(cur_substr)
325 |                 start = end
326 | 
327 |             if is_bad:
328 |                 output_tokens.append(self.unk_token)
329 |             else:
330 |                 output_tokens.extend(sub_tokens)
331 |         return output_tokens
332 | 
333 | 
334 | def _is_whitespace(char):
335 |     """Checks whether `char` is a whitespace character."""
336 |     # \t, \n, and \r are technically control characters but we treat them
337 |     # as whitespace since they are generally considered as such.
338 |     if char == " " or char == "\t" or char == "\n" or char == "\r":
339 |         return True
340 |     cat = unicodedata.category(char)
341 |     if cat == "Zs":
342 |         return True
343 |     return False
344 | 
345 | 
346 | def _is_control(char):
347 |     """Checks whether `char` is a control character."""
348 |     # These are technically control characters but we count them as whitespace
349 |     # characters.
350 |     if char == "\t" or char == "\n" or char == "\r":
351 |         return False
352 |     cat = unicodedata.category(char)
353 |     if cat.startswith("C"):
354 |         return True
355 |     return False
356 | 
357 | 
358 | def _is_punctuation(char):
359 |     """Checks whether `char` is a punctuation character."""
360 |     cp = ord(char)
361 |     # We treat all non-letter/number ASCII as punctuation.
362 |     # Characters such as "^", "$", and "`" are not in the Unicode
363 |     # Punctuation class but we treat them as punctuation anyways, for
364 |     # consistency.
365 |     if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
366 |             (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
367 |         return True
368 |     cat = unicodedata.category(char)
369 |     if cat.startswith("P"):
370 |         return True
371 |     return False
372 | 
--------------------------------------------------------------------------------
/sentence_similarity_Bert/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ Utilities for PyTorch Transformer XL model. 17 | Directly adapted from https://github.com/kimiyoung/transformer-xl. 18 | """ 19 | 20 | from collections import defaultdict 21 | 22 | import numpy as np 23 | 24 | import torch 25 | import torch.nn as nn 26 | import torch.nn.functional as F 27 | 28 | # CUDA_MAJOR = int(torch.version.cuda.split('.')[0]) 29 | # CUDA_MINOR = int(torch.version.cuda.split('.')[1]) 30 | 31 | class ProjectedAdaptiveLogSoftmax(nn.Module): 32 | def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, 33 | keep_order=False): 34 | super(ProjectedAdaptiveLogSoftmax, self).__init__() 35 | 36 | self.n_token = n_token 37 | self.d_embed = d_embed 38 | self.d_proj = d_proj 39 | 40 | self.cutoffs = cutoffs + [n_token] 41 | self.cutoff_ends = [0] + self.cutoffs 42 | self.div_val = div_val 43 | 44 | self.shortlist_size = self.cutoffs[0] 45 | self.n_clusters = len(self.cutoffs) - 1 46 | self.head_size = self.shortlist_size + self.n_clusters 47 | 48 | if self.n_clusters > 0: 49 | self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed)) 50 | self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters)) 51 | 52 | self.out_layers = nn.ModuleList() 53 | self.out_projs = nn.ParameterList() 54 | 55 | if div_val == 1: 56 | for i in range(len(self.cutoffs)): 57 | if d_proj != d_embed: 58 | self.out_projs.append( 59 | nn.Parameter(torch.Tensor(d_proj, d_embed)) 60 | ) 61 | else: 62 | self.out_projs.append(None) 63 | 64 | self.out_layers.append(nn.Linear(d_embed, n_token)) 65 | else: 66 | for i in range(len(self.cutoffs)): 67 | l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] 68 | d_emb_i = d_embed // (div_val ** i) 69 | 70 | self.out_projs.append( 71 | nn.Parameter(torch.Tensor(d_proj, d_emb_i)) 72 | ) 73 | 74 | self.out_layers.append(nn.Linear(d_emb_i, r_idx-l_idx)) 75 | 76 | self.keep_order = keep_order 77 | 78 | def _compute_logit(self, hidden, weight, bias, proj): 79 | if proj is None: 80 | logit = F.linear(hidden, weight, bias=bias) 81 | else: 82 | # if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1: 83 | proj_hid = F.linear(hidden, proj.t().contiguous()) 84 | logit = F.linear(proj_hid, weight, bias=bias) 85 | # else: 86 | # logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t())) 87 | # if bias is not None: 88 | # logit = logit + bias 89 | 90 | return logit 91 | 92 | def forward(self, hidden, target=None, keep_order=False): 93 | ''' 94 | Params: 95 | hidden :: [len*bsz x d_proj] 96 | target :: [len*bsz] 97 | Return: 98 | if target is None: 99 | out :: [len*bsz] Negative log likelihood 100 | else: 101 | out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary 102 | We could replace this implementation by the native PyTorch one 103 | if their's had an option to set bias on all clusters in the native one. 
104 | here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138 105 | ''' 106 | 107 | if target is not None: 108 | target = target.view(-1) 109 | if hidden.size(0) != target.size(0): 110 | raise RuntimeError('Input and target should have the same size ' 111 | 'in the batch dimension.') 112 | 113 | if self.n_clusters == 0: 114 | logit = self._compute_logit(hidden, self.out_layers[0].weight, 115 | self.out_layers[0].bias, self.out_projs[0]) 116 | if target is not None: 117 | output = -F.log_softmax(logit, dim=-1) \ 118 | .gather(1, target.unsqueeze(1)).squeeze(1) 119 | else: 120 | output = F.log_softmax(logit, dim=-1) 121 | else: 122 | # construct weights and biases 123 | weights, biases = [], [] 124 | for i in range(len(self.cutoffs)): 125 | if self.div_val == 1: 126 | l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] 127 | weight_i = self.out_layers[0].weight[l_idx:r_idx] 128 | bias_i = self.out_layers[0].bias[l_idx:r_idx] 129 | else: 130 | weight_i = self.out_layers[i].weight 131 | bias_i = self.out_layers[i].bias 132 | 133 | if i == 0: 134 | weight_i = torch.cat( 135 | [weight_i, self.cluster_weight], dim=0) 136 | bias_i = torch.cat( 137 | [bias_i, self.cluster_bias], dim=0) 138 | 139 | weights.append(weight_i) 140 | biases.append(bias_i) 141 | 142 | head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0] 143 | 144 | head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj) 145 | head_logprob = F.log_softmax(head_logit, dim=1) 146 | 147 | if target is None: 148 | out = hidden.new_empty((head_logit.size(0), self.n_token)) 149 | else: 150 | out = torch.zeros_like(target, dtype=hidden.dtype, device=hidden.device) 151 | 152 | offset = 0 153 | cutoff_values = [0] + self.cutoffs 154 | for i in range(len(cutoff_values) - 1): 155 | l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1] 156 | 157 | if target is not None: 158 | mask_i = (target >= l_idx) & (target < r_idx) 159 | indices_i = mask_i.nonzero().squeeze() 160 | 161 | if indices_i.numel() == 0: 162 | continue 163 | 164 | target_i = target.index_select(0, indices_i) - l_idx 165 | head_logprob_i = head_logprob.index_select(0, indices_i) 166 | hidden_i = hidden.index_select(0, indices_i) 167 | else: 168 | hidden_i = hidden 169 | 170 | if i == 0: 171 | if target is not None: 172 | logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1) 173 | else: 174 | out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]] 175 | else: 176 | weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] 177 | 178 | tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i) 179 | tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) 180 | cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster 181 | if target is not None: 182 | logprob_i = head_logprob_i[:, cluster_prob_idx] \ 183 | + tail_logprob_i.gather(1, target_i[:, None]).squeeze(1) 184 | else: 185 | logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i 186 | out[:, l_idx:r_idx] = logprob_i 187 | 188 | if target is not None: 189 | if (hasattr(self, 'keep_order') and self.keep_order) or keep_order: 190 | out.index_copy_(0, indices_i, -logprob_i) 191 | else: 192 | out[offset:offset+logprob_i.size(0)].copy_(-logprob_i) 193 | offset += logprob_i.size(0) 194 | 195 | return out 196 | 197 | 198 | def log_prob(self, hidden): 199 | r""" Computes log probabilities for all :math:`n\_classes` 200 | From: 
https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py
201 |         Args:
202 |             hidden (Tensor): a minibatch of examples
203 |         Returns:
204 |             log-probabilities for each class :math:`c`
205 |             in range :math:`0 <= c <= n\_classes`, where :math:`n\_classes` is a
206 |             parameter passed to ``AdaptiveLogSoftmaxWithLoss`` constructor.
207 |         Shape:
208 |             - Input: :math:`(N, in\_features)`
209 |             - Output: :math:`(N, n\_classes)`
210 |         """
211 |         if self.n_clusters == 0:
212 |             logit = self._compute_logit(hidden, self.out_layers[0].weight,
213 |                                         self.out_layers[0].bias, self.out_projs[0])
214 |             return F.log_softmax(logit, dim=-1)
215 |         else:
216 |             # construct weights and biases
217 |             weights, biases = [], []
218 |             for i in range(len(self.cutoffs)):
219 |                 if self.div_val == 1:
220 |                     l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
221 |                     weight_i = self.out_layers[0].weight[l_idx:r_idx]
222 |                     bias_i = self.out_layers[0].bias[l_idx:r_idx]
223 |                 else:
224 |                     weight_i = self.out_layers[i].weight
225 |                     bias_i = self.out_layers[i].bias
226 | 
227 |                 if i == 0:
228 |                     weight_i = torch.cat(
229 |                         [weight_i, self.cluster_weight], dim=0)
230 |                     bias_i = torch.cat(
231 |                         [bias_i, self.cluster_bias], dim=0)
232 | 
233 |                 weights.append(weight_i)
234 |                 biases.append(bias_i)
235 | 
236 |             head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]
237 |             head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
238 | 
239 |             out = hidden.new_empty((head_logit.size(0), self.n_token))
240 |             head_logprob = F.log_softmax(head_logit, dim=1)
241 | 
242 |             cutoff_values = [0] + self.cutoffs
243 |             for i in range(len(cutoff_values) - 1):
244 |                 start_idx, stop_idx = cutoff_values[i], cutoff_values[i + 1]
245 | 
246 |                 if i == 0:
247 |                     out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]]
248 |                 else:
249 |                     weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
250 | 
251 |                     tail_logit_i = self._compute_logit(hidden, weight_i, bias_i, proj_i)
252 |                     tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
253 | 
254 |                     logprob_i = head_logprob[:, self.cutoffs[0] + i - 1, None] + tail_logprob_i
255 |                     out[:, start_idx:stop_idx] = logprob_i
256 | 
257 |             return out
258 | 
259 | 
260 | class LogUniformSampler(object):
261 |     def __init__(self, range_max, n_sample):
262 |         """
263 |         Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py
264 |         `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`
265 | 
266 |         expected count can be approximated by 1 - (1 - p)^n
267 |         and we use a numerically stable version -expm1(num_tries * log1p(-p))
268 | 
269 |         Our implementation fixes num_tries at 2 * n_sample, and the actual #samples will vary from run to run
270 |         """
271 |         with torch.no_grad():
272 |             self.range_max = range_max
273 |             log_indices = torch.arange(1., range_max + 2., 1.).log_()
274 |             self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
275 |             # print('P', self.dist.numpy().tolist()[-30:])
276 | 
277 |             self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float()
278 | 
279 |         self.n_sample = n_sample
280 | 
281 |     def sample(self, labels):
282 |         """
283 |             labels: [b1, b2]
284 |         Return
285 |             true_log_probs: [b1, b2]
286 |             samp_log_probs: [n_sample]
287 |             neg_samples: [n_sample]
288 |         """
289 | 
290 |         # neg_samples = torch.empty(0).long()
291 |         n_sample = self.n_sample
292 |         n_tries = 2 * n_sample
293 | 
294 |         with torch.no_grad():
295 |             neg_samples = torch.multinomial(self.dist, n_tries,
replacement=True).unique() 296 | device = labels.device 297 | neg_samples = neg_samples.to(device) 298 | true_log_probs = self.log_q[labels].to(device) 299 | samp_log_probs = self.log_q[neg_samples].to(device) 300 | return true_log_probs, samp_log_probs, neg_samples 301 | 302 | def sample_logits(embedding, bias, labels, inputs, sampler): 303 | """ 304 | embedding: an nn.Embedding layer 305 | bias: [n_vocab] 306 | labels: [b1, b2] 307 | inputs: [b1, b2, n_emb] 308 | sampler: you may use a LogUniformSampler 309 | Return 310 | logits: [b1, b2, 1 + n_sample] 311 | """ 312 | true_log_probs, samp_log_probs, neg_samples = sampler.sample(labels) 313 | n_sample = neg_samples.size(0) 314 | b1, b2 = labels.size(0), labels.size(1) 315 | all_ids = torch.cat([labels.view(-1), neg_samples]) 316 | all_w = embedding(all_ids) 317 | true_w = all_w[: -n_sample].view(b1, b2, -1) 318 | sample_w = all_w[- n_sample:].view(n_sample, -1) 319 | 320 | all_b = bias[all_ids] 321 | true_b = all_b[: -n_sample].view(b1, b2) 322 | sample_b = all_b[- n_sample:] 323 | 324 | hit = (labels[:, :, None] == neg_samples).detach() 325 | 326 | true_logits = torch.einsum('ijk,ijk->ij', 327 | [true_w, inputs]) + true_b - true_log_probs 328 | sample_logits = torch.einsum('lk,ijk->ijl', 329 | [sample_w, inputs]) + sample_b - samp_log_probs 330 | sample_logits.masked_fill_(hit, -1e30) 331 | logits = torch.cat([true_logits[:, :, None], sample_logits], -1) 332 | 333 | return logits 334 | 335 | 336 | # class LogUniformSampler(object): 337 | # def __init__(self, range_max, unique=False): 338 | # """ 339 | # Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py 340 | # `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)` 341 | # """ 342 | # self.range_max = range_max 343 | # log_indices = torch.arange(1., range_max+2., 1.).log_() 344 | # self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1] 345 | 346 | # self.unique = unique 347 | 348 | # if self.unique: 349 | # self.exclude_mask = torch.ByteTensor(range_max).fill_(0) 350 | 351 | # def sample(self, n_sample, labels): 352 | # pos_sample, new_labels = labels.unique(return_inverse=True) 353 | # n_pos_sample = pos_sample.size(0) 354 | # n_neg_sample = n_sample - n_pos_sample 355 | 356 | # if self.unique: 357 | # self.exclude_mask.index_fill_(0, pos_sample, 1) 358 | # sample_dist = self.dist.clone().masked_fill_(self.exclude_mask, 0) 359 | # self.exclude_mask.index_fill_(0, pos_sample, 0) 360 | # else: 361 | # sample_dist = self.dist 362 | 363 | # neg_sample = torch.multinomial(sample_dist, n_neg_sample) 364 | 365 | # sample = torch.cat([pos_sample, neg_sample]) 366 | # sample_prob = self.dist[sample] 367 | 368 | # return new_labels, sample, sample_prob 369 | 370 | 371 | if __name__ == '__main__': 372 | S, B = 3, 4 373 | n_vocab = 10000 374 | n_sample = 5 375 | H = 32 376 | 377 | labels = torch.LongTensor(S, B).random_(0, n_vocab) 378 | 379 | # sampler = LogUniformSampler(n_vocab, unique=False) 380 | # new_labels, sample, sample_prob = sampler.sample(n_sample, labels) 381 | 382 | sampler = LogUniformSampler(n_vocab, n_sample)#, unique=True) 383 | # true_probs, samp_probs, neg_samples = sampler.sample(n_sample, labels) 384 | 385 | # print('true_probs', true_probs.numpy().tolist()) 386 | # print('samp_probs', samp_probs.numpy().tolist()) 387 | # print('neg_samples', neg_samples.numpy().tolist()) 388 | 389 | # print('sum', torch.sum(sampler.dist).item()) 390 | 391 | # assert 
torch.all(torch.sort(sample.unique())[0].eq(torch.sort(sample)[0])).item()
392 | 
393 |     embedding = nn.Embedding(n_vocab, H)
394 |     bias = torch.zeros(n_vocab)
395 |     inputs = torch.Tensor(S, B, H).normal_()
396 | 
397 |     logits = sample_logits(embedding, bias, labels, inputs, sampler)
398 |     print('logits', logits.detach().numpy().tolist())
399 |     print('logits shape', logits.size())
400 |     print('labels', labels.detach().numpy().tolist())
401 |     print('labels shape', labels.size())
402 | 
403 | 
--------------------------------------------------------------------------------
/sentence_similarity_Bert/examples/run_classifier_modify2.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """BERT finetuning runner."""
17 | 
18 | from __future__ import absolute_import, division, print_function
19 | 
20 | import argparse
21 | import csv
22 | import logging
23 | import os
24 | import random
25 | import pandas as pd
26 | import sys
27 | 
28 | import re
29 | import numpy as np
30 | import torch
31 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
32 |                               TensorDataset)
33 | from torch.utils.data.distributed import DistributedSampler
34 | from tqdm import tqdm, trange
35 | 
36 | from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
37 | from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
38 | from pytorch_pretrained_bert.tokenization import BertTokenizer
39 | from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
40 | 
41 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
42 |                     datefmt='%m/%d/%Y %H:%M:%S',
43 |                     level=logging.INFO)
44 | logger = logging.getLogger(__name__)
45 | 
46 | 
47 | class InputExample(object):
48 |     """A single training/test example for simple sequence classification."""
49 | 
50 |     def __init__(self, guid, text_a, text_b=None, label=None):
51 |         """Constructs an InputExample.
52 | 
53 |         Args:
54 |             guid: Unique id for the example.
55 |             text_a: string. The untokenized text of the first sequence. For single
56 |                 sequence tasks, only this sequence must be specified.
57 |             text_b: (Optional) string. The untokenized text of the second sequence.
58 |                 Only needs to be specified for sequence pair tasks.
59 |             label: (Optional) string. The label of the example. This should be
60 |                 specified for train and dev examples, but not for test examples.
61 |         """
62 |         self.guid = guid
63 |         self.text_a = text_a
64 |         self.text_b = text_b
65 |         self.label = label
66 | 
67 | 
68 | class InputFeatures(object):
69 |     """A single set of features of data."""
70 | 
71 |     def __init__(self, input_ids, input_mask, segment_ids, label_id):
72 |         self.input_ids = input_ids
73 |         self.input_mask = input_mask
74 |         self.segment_ids = segment_ids
75 |         self.label_id = label_id
76 | 
77 | 
78 | class DataProcessor(object):
79 |     """Base class for data converters for sequence classification data sets."""
80 | 
81 |     def get_train_examples(self, data_dir):
82 |         """Gets a collection of `InputExample`s for the train set."""
83 |         raise NotImplementedError()
84 | 
85 |     def get_dev_examples(self, data_dir):
86 |         """Gets a collection of `InputExample`s for the dev set."""
87 |         raise NotImplementedError()
88 | 
89 |     def get_labels(self):
90 |         """Gets the list of labels for this data set."""
91 |         raise NotImplementedError()
92 | 
93 | 
94 | # Modified here: data loading for the sentence-pair CSV files
95 | class SimProcessor(DataProcessor):
96 | 
97 |     def get_train_examples(self, data_dir):
98 |         """See base class."""
99 |         logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train")))
100 | 
101 |         file_path = os.path.join(data_dir, 'train.csv')
102 |         train_df = pd.read_csv(file_path, encoding='utf-8')
103 |         train_data = []
104 |         for index, train in enumerate(train_df.values):
105 |             guid = 'train-%d' % index
106 |             text_a = str(train[0])
107 |             text_b = str(train[1])
108 |             label = str(train[2])
109 |             train_data.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
110 |         return train_data
111 | 
112 | 
113 |     def get_dev_examples(self, data_dir):
114 | 
115 |         file_path = os.path.join(data_dir, 'dev.csv')
116 |         dev_df = pd.read_csv(file_path, encoding='utf-8')
117 |         dev_data = []
118 |         for index, dev in enumerate(dev_df.values):
119 |             guid = 'dev-%d' % index
120 |             text_a = str(dev[0])
121 |             text_b = str(dev[1])
122 |             label = str(dev[2])
123 |             dev_data.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
124 |         return dev_data
125 |     # CSV columns: index, sen1, sen2, label
126 | 
127 | 
128 | 
129 |     # return all of the class labels
130 |     def get_labels(self):
131 |         """See base class."""
132 |         return ["0", "1"]
133 | 
134 | 
135 | def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
136 |     """Loads a data file into a list of `InputBatch`s."""
137 | 
138 |     label_map = {label: i for i, label in enumerate(label_list)}
139 |     features = []
140 |     for (ex_index, example) in enumerate(examples):
141 |         tokens_a = tokenizer.tokenize(example.text_a)
142 |         tokens_b = None
143 |         if example.text_b:
144 |             tokens_b = tokenizer.tokenize(example.text_b)
145 |             _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
146 |         else:
147 |             if len(tokens_a) > max_seq_length - 2:
148 |                 tokens_a = tokens_a[:(max_seq_length - 2)]
149 |         tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
150 |         segment_ids = [0] * len(tokens)
151 |         if tokens_b:
152 |             tokens += tokens_b + ["[SEP]"]
153 |             segment_ids += [1] * (len(tokens_b) + 1)
154 |         input_ids = tokenizer.convert_tokens_to_ids(tokens)
155 |         input_mask = [1] * len(input_ids)
156 |         padding = [0] * (max_seq_length - len(input_ids))
157 |         input_ids += padding
158 |         input_mask += padding
159 |         segment_ids += padding
160 | 
161 |         assert len(input_ids) == max_seq_length
162 |         assert len(input_mask) == max_seq_length
163 |         assert len(segment_ids) == max_seq_length
164 | 
165 |         label_id = label_map[example.label]
166 |         if ex_index < 5:
167 |             logger.info("*** Example ***")
168 |             logger.info("guid: %s" % (example.guid))
169 | 
logger.info("tokens: %s" % " ".join( 170 | [str(x) for x in tokens])) 171 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 172 | logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) 173 | logger.info( 174 | "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) 175 | logger.info("label: %s (id = %d)" % (example.label, label_id)) 176 | 177 | features.append( 178 | InputFeatures(input_ids=input_ids, 179 | input_mask=input_mask, 180 | segment_ids=segment_ids, 181 | label_id=label_id)) 182 | return features 183 | 184 | 185 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 186 | """Truncates a sequence pair in place to the maximum length.""" 187 | 188 | # This is a simple heuristic which will always truncate the longer sequence 189 | # one token at a time. This makes more sense than truncating an equal percent 190 | # of tokens from each, since if one sequence is very short then each token 191 | # that's truncated likely contains more information than a longer sequence. 192 | while True: 193 | total_length = len(tokens_a) + len(tokens_b) 194 | if total_length <= max_length: 195 | break 196 | if len(tokens_a) > len(tokens_b): 197 | tokens_a.pop() 198 | else: 199 | tokens_b.pop() 200 | 201 | 202 | def accuracy(out, labels): 203 | outputs = np.argmax(out, axis=1) 204 | return np.sum(outputs == labels) 205 | 206 | 207 | def main(): 208 | parser = argparse.ArgumentParser() 209 | 210 | ## Required parameters 211 | parser.add_argument("--data_dir", 212 | default=None, 213 | type=str, 214 | required=True, 215 | help="The input data dir. Should contain the .tsv files (or other data files) for the task.") 216 | parser.add_argument("--bert_model", default=None, type=str, required=True, 217 | help="Bert pre-trained model selected in the list: bert-base-uncased, " 218 | "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " 219 | "bert-base-multilingual-cased, bert-base-chinese.") 220 | parser.add_argument("--task_name", 221 | default=None, 222 | type=str, 223 | required=True, 224 | help="The name of the task to train.") 225 | parser.add_argument("--output_dir", 226 | default=None, 227 | type=str, 228 | required=True, 229 | help="The output directory where the model predictions and checkpoints will be written.") 230 | 231 | ## Other parameters 232 | parser.add_argument("--cache_dir", 233 | default="", 234 | type=str, 235 | help="Where do you want to store the pre-trained models downloaded from s3") 236 | parser.add_argument("--max_seq_length", 237 | default=128, 238 | type=int, 239 | help="The maximum total input sequence length after WordPiece tokenization. 
\n" 240 | "Sequences longer than this will be truncated, and sequences shorter \n" 241 | "than this will be padded.") 242 | parser.add_argument("--do_train", 243 | action='store_true', 244 | help="Whether to run training.") 245 | parser.add_argument("--do_eval", 246 | action='store_true', 247 | help="Whether to run eval on the dev set.") 248 | parser.add_argument("--do_lower_case", 249 | action='store_true', 250 | help="Set this flag if you are using an uncased model.") 251 | parser.add_argument("--train_batch_size", 252 | default=32, 253 | type=int, 254 | help="Total batch size for training.") 255 | parser.add_argument("--eval_batch_size", 256 | default=8, 257 | type=int, 258 | help="Total batch size for eval.") 259 | parser.add_argument("--learning_rate", 260 | default=5e-5, 261 | type=float, 262 | help="The initial learning rate for Adam.") 263 | parser.add_argument("--num_train_epochs", 264 | default=1.0, 265 | type=float, 266 | help="Total number of training epochs to perform.") 267 | parser.add_argument("--warmup_proportion", 268 | default=0.1, 269 | type=float, 270 | help="Proportion of training to perform linear learning rate warmup for. " 271 | "E.g., 0.1 = 10%% of training.") 272 | parser.add_argument("--no_cuda", 273 | action='store_true', 274 | help="Whether not to use CUDA when available") 275 | parser.add_argument("--local_rank", 276 | type=int, 277 | default=-1, 278 | help="local_rank for distributed training on gpus") 279 | parser.add_argument('--seed', 280 | type=int, 281 | default=42, 282 | help="random seed for initialization") 283 | parser.add_argument('--gradient_accumulation_steps', 284 | type=int, 285 | default=1, 286 | help="Number of updates steps to accumulate before performing a backward/update pass.") 287 | parser.add_argument('--fp16', 288 | action='store_true', 289 | help="Whether to use 16-bit float precision instead of 32-bit") 290 | parser.add_argument('--loss_scale', 291 | type=float, default=0, 292 | help="Loss scaling to improve fp16 numeric stability. 


def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .csv files (train.csv / dev.csv) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir",
                        default="",
                        type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=1.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Do not use CUDA even when it is available.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {
        # "cola": ColaProcessor,
        # "mnli": MnliProcessor,
        "mrpc": SimProcessor
    }

    num_labels_task = {
        "mrpc": 2,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
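
    # Editor's note (illustrative numbers, not from the original): with 100,000 training
    # pairs, train_batch_size=32, gradient_accumulation_steps=1 and num_train_epochs=1.0,
    # num_train_optimization_steps = int(100000 / 32 / 1) * 1.0 = 3125.0 optimizer updates;
    # BertAdam below uses this value as t_total to schedule the linear warmup and decay.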

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
                                                                   'distributed_{}'.format(args.local_rank))
    model = BertForSequenceClassification.from_pretrained(args.bert_model,
                                                          cache_dir=cache_dir,
                                                          num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
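                # Editor's note: dividing the loss by gradient_accumulation_steps (below)
                # keeps the accumulated gradient equal to the mean over the effective
                # batch; e.g. with gradient_accumulation_steps=4, optimizer.step() only
                # runs on every 4th mini-batch, so four quarter-scaled losses sum to one
                # full-batch average.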
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(global_step / num_train_optimization_steps,
                                                                          args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load a trained model and config that you have fine-tuned
        config = BertConfig(output_config_file)
        model = BertForSequenceClassification(config, num_labels=num_labels)
        model.load_state_dict(torch.load(output_model_file))
    else:
        model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'global_step': global_step,
                  'loss': loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
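
Usage sketch (editor's addition; paths and hyperparameter values are placeholders, not from
the original repository). The runner above is driven entirely by argparse, and the only
registered task key is "mrpc" (mapped to SimProcessor), so a typical fine-tuning launch
with the Chinese model looks like:

    python run_classifier_modify2.py \
        --task_name mrpc \
        --do_train \
        --do_eval \
        --data_dir /path/to/csv_data \
        --bert_model bert-base-chinese \
        --max_seq_length 128 \
        --train_batch_size 32 \
        --learning_rate 5e-5 \
        --num_train_epochs 1.0 \
        --output_dir /path/to/output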