├── sentence_similarity_Bert
│   ├── pytorch_pretrained_bert
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── modeling.cpython-36.pyc
│   │   │   ├── file_utils.cpython-36.pyc
│   │   │   ├── modeling_gpt2.cpython-36.pyc
│   │   │   ├── optimization.cpython-36.pyc
│   │   │   ├── tokenization.cpython-36.pyc
│   │   │   ├── modeling_openai.cpython-36.pyc
│   │   │   ├── tokenization_gpt2.cpython-36.pyc
│   │   │   ├── modeling_transfo_xl.cpython-36.pyc
│   │   │   ├── optimization_openai.cpython-36.pyc
│   │   │   ├── tokenization_openai.cpython-36.pyc
│   │   │   ├── tokenization_transfo_xl.cpython-36.pyc
│   │   │   └── modeling_transfo_xl_utilities.cpython-36.pyc
│   │   ├── __init__.py
│   │   ├── convert_tf_checkpoint_to_pytorch.py
│   │   ├── convert_gpt2_checkpoint_to_pytorch.py
│   │   ├── convert_openai_checkpoint_to_pytorch.py
│   │   ├── __main__.py
│   │   ├── convert_transfo_xl_checkpoint_to_pytorch.py
│   │   ├── optimization_openai.py
│   │   ├── optimization.py
│   │   ├── file_utils.py
│   │   ├── tokenization_gpt2.py
│   │   ├── tokenization_openai.py
│   │   ├── tokenization.py
│   │   └── modeling_transfo_xl_utilities.py
│   ├── examples
│   │   ├── models
│   │   │   └── chinese_L-12_H-768_A-12
│   │   │       ├── bert_model.ckpt.index
│   │   │       ├── bert_model.ckpt.meta
│   │   │       └── bert_config.json
│   │   ├── run_classifier_class.py
│   │   ├── extract_features.py
│   │   └── run_classifier_modify2.py
│   ├── requirements.txt
│   ├── tests
│   │   ├── optimization_test.py
│   │   ├── tokenization_openai_test.py
│   │   ├── tokenization_transfo_xl_test.py
│   │   ├── tokenization_test.py
│   │   ├── modeling_gpt2_test.py
│   │   ├── modeling_transfo_xl_test.py
│   │   ├── modeling_openai_test.py
│   │   └── modeling_test.py
│   └── setup.py
└── README.md
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Bert_sentence_similarity
2 | 
3 | Fine-tune a pretrained BERT model to compute sentence similarity
4 | 
5 | 1) Run ./sentence_similarity_Bert/examples/run_classifier_modify2 to fine-tune
6 | 
7 | 2) The training set is the Ant Financial text-matching data, located in the chinese_data folder
8 | 
9 | 3) Run run_classifier_class to test
10 | 
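11 | As a quick sanity check after fine-tuning, the sketch below scores one sentence pair directly with the vendored `pytorch_pretrained_bert` API. It is a minimal, hypothetical example: `examples/fine_tuned/` stands in for wherever the fine-tuning run saved `pytorch_model.bin`, `bert_config.json` and `vocab.txt`, and the two questions are made up.
12 | 
13 | ```python
14 | import torch
15 | import torch.nn.functional as F
16 | from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification
17 | 
18 | model_dir = "examples/fine_tuned/"  # placeholder path, see step 1) above
19 | tokenizer = BertTokenizer.from_pretrained(model_dir)
20 | model = BertForSequenceClassification.from_pretrained(model_dir, num_labels=2)
21 | model.eval()
22 | 
23 | # Standard BERT sentence-pair encoding: [CLS] sentence_a [SEP] sentence_b [SEP]
24 | tokens_a = tokenizer.tokenize("花呗怎么还款")
25 | tokens_b = tokenizer.tokenize("花呗如何还款")
26 | tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
27 | input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
28 | segment_ids = torch.tensor([[0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)])
29 | 
30 | with torch.no_grad():
31 |     logits = model(input_ids, segment_ids)  # returns logits when no labels are passed
32 | print(F.softmax(logits, dim=-1))  # probabilities over the two labels; which index means "match" depends on the training labels
33 | ```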
--------------------------------------------------------------------------------
/sentence_similarity_Bert/requirements.txt:
--------------------------------------------------------------------------------
1 | # PyTorch
2 | torch>=0.4.1
3 | # progress bars in model download and training scripts
4 | tqdm
5 | # Accessing files from S3 directly.
6 | boto3
7 | # Used for downloading models over HTTP
8 | requests
9 | # For OpenAI GPT
10 | regex
--------------------------------------------------------------------------------
/sentence_similarity_Bert/examples/models/chinese_L-12_H-768_A-12/bert_config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "attention_probs_dropout_prob": 0.1,
3 |   "directionality": "bidi",
4 |   "hidden_act": "gelu",
5 |   "hidden_dropout_prob": 0.1,
6 |   "hidden_size": 768,
7 |   "initializer_range": 0.02,
8 |   "intermediate_size": 3072,
9 |   "max_position_embeddings": 512,
10 |   "num_attention_heads": 12,
11 |   "num_hidden_layers": 12,
12 |   "pooler_fc_size": 768,
13 |   "pooler_num_attention_heads": 12,
14 |   "pooler_num_fc_layers": 3,
15 |   "pooler_size_per_head": 128,
16 |   "pooler_type": "first_token_transform",
17 |   "type_vocab_size": 2,
18 |   "vocab_size": 21128
19 | }
20 | 
--------------------------------------------------------------------------------
/sentence_similarity_Bert/pytorch_pretrained_bert/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.6.1"
2 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
3 | from .tokenization_openai import OpenAIGPTTokenizer
4 | from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
5 | from .tokenization_gpt2 import GPT2Tokenizer
6 | 
7 | from .modeling import (BertConfig, BertModel, BertForPreTraining,
8 |                        BertForMaskedLM, BertForNextSentencePrediction,
9 |                        BertForSequenceClassification, BertForMultipleChoice,
10 |                        BertForTokenClassification, BertForQuestionAnswering,
11 |                        load_tf_weights_in_bert)
12 | from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,
13 |                               OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
14 |                               load_tf_weights_in_openai_gpt)
15 | from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel,
16 | 
load_tf_weights_in_transfo_xl) 17 | from .modeling_gpt2 import (GPT2Config, GPT2Model, 18 | GPT2LMHeadModel, GPT2DoubleHeadsModel, 19 | load_tf_weights_in_gpt2) 20 | 21 | from .optimization import BertAdam 22 | from .optimization_openai import OpenAIAdam 23 | 24 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path 25 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/tests/optimization_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | 21 | import torch 22 | 23 | from pytorch_pretrained_bert import BertAdam 24 | 25 | class OptimizationTest(unittest.TestCase): 26 | 27 | def assertListAlmostEqual(self, list1, list2, tol): 28 | self.assertEqual(len(list1), len(list2)) 29 | for a, b in zip(list1, list2): 30 | self.assertAlmostEqual(a, b, delta=tol) 31 | 32 | def test_adam(self): 33 | w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True) 34 | target = torch.tensor([0.4, 0.2, -0.5]) 35 | criterion = torch.nn.MSELoss() 36 | # No warmup, constant schedule, no gradient clipping 37 | optimizer = BertAdam(params=[w], lr=2e-1, 38 | weight_decay=0.0, 39 | max_grad_norm=-1) 40 | for _ in range(100): 41 | loss = criterion(w, target) 42 | loss.backward() 43 | optimizer.step() 44 | w.grad.detach_() # No zero_grad() function on simple tensors. we do it ourselves. 45 | w.grad.zero_() 46 | self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2) 47 | 48 | 49 | if __name__ == "__main__": 50 | unittest.main() 51 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/tests/tokenization_openai_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from __future__ import absolute_import, division, print_function, unicode_literals
16 | 
17 | import os
18 | import unittest
19 | import json
20 | 
21 | from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer
22 | 
23 | 
24 | class OpenAIGPTTokenizationTest(unittest.TestCase):
25 | 
26 |     def test_full_tokenizer(self):
27 |         """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
28 |         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
29 |                  "w</w>", "r</w>", "t</w>",
30 |                  "lo", "low", "er</w>",
31 |                  "low</w>", "lowest</w>", "newer</w>", "wider</w>"]
32 |         vocab_tokens = dict(zip(vocab, range(len(vocab))))
33 |         merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
34 |         with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
35 |             json.dump(vocab_tokens, fp)
36 |             vocab_file = fp.name
37 |         with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
38 |             fp.write("\n".join(merges))
39 |             merges_file = fp.name
40 | 
41 |         tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file, special_tokens=["<unk>"])
42 |         os.remove(vocab_file)
43 |         os.remove(merges_file)
44 | 
45 |         text = "lower"
46 |         bpe_tokens = ["low", "er</w>"]
47 |         tokens = tokenizer.tokenize(text)
48 |         self.assertListEqual(tokens, bpe_tokens)
49 | 
50 |         input_tokens = tokens + ["<unk>"]
51 |         input_bpe_tokens = [14, 15, 20]
52 |         self.assertListEqual(
53 |             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
54 | 
55 | if __name__ == '__main__':
56 |     unittest.main()
57 | 
--------------------------------------------------------------------------------
/sentence_similarity_Bert/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert BERT checkpoint."""
16 | 
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | import os
22 | import re
23 | import argparse
24 | import tensorflow as tf
25 | import torch
26 | import numpy as np
27 | 
28 | from pytorch_pretrained_bert.modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert
29 | 
30 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
31 |     # Initialise PyTorch model
32 |     config = BertConfig.from_json_file(bert_config_file)
33 |     print("Building PyTorch model from configuration: {}".format(str(config)))
34 |     model = BertForPreTraining(config)
35 | 
36 |     # Load weights from tf checkpoint
37 |     load_tf_weights_in_bert(model, tf_checkpoint_path)
38 | 
39 |     # Save pytorch-model
40 |     print("Save PyTorch model to {}".format(pytorch_dump_path))
41 |     torch.save(model.state_dict(), pytorch_dump_path)
42 | 
43 | 
44 | if __name__ == "__main__":
45 |     parser = argparse.ArgumentParser()
46 |     ## Required parameters
47 |     parser.add_argument("--tf_checkpoint_path",
48 |                         default = None,
49 |                         type = str,
50 |                         required = True,
51 |                         help = "Path to the TensorFlow checkpoint.")
52 |     parser.add_argument("--bert_config_file",
53 |                         default = None,
54 |                         type = str,
55 |                         required = True,
56 |                         help = "The config json file corresponding to the pre-trained BERT model. \n"
57 |                                "This specifies the model architecture.")
58 |     parser.add_argument("--pytorch_dump_path",
59 |                         default = None,
60 |                         type = str,
61 |                         required = True,
62 |                         help = "Path to the output PyTorch model.")
63 |     args = parser.parse_args()
64 |     convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
65 |                                      args.bert_config_file,
66 |                                      args.pytorch_dump_path)
67 | 
--------------------------------------------------------------------------------
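Editor's note: a minimal sketch of driving the converter above against the Chinese BERT checkpoint bundled under examples/models/ (paths are relative to sentence_similarity_Bert/ and the output filename is an arbitrary choice; TensorFlow must be installed for load_tf_weights_in_bert to work):

    from pytorch_pretrained_bert.convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch

    convert_tf_checkpoint_to_pytorch(
        "examples/models/chinese_L-12_H-768_A-12/bert_model.ckpt",   # TF checkpoint prefix (.index/.meta sit next to it)
        "examples/models/chinese_L-12_H-768_A-12/bert_config.json",  # architecture description
        "examples/models/chinese_L-12_H-768_A-12/pytorch_model.bin") # output PyTorch weights

The same conversion is exposed on the command line by __main__.py further down as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`.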
15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import re 23 | import argparse 24 | import tensorflow as tf 25 | import torch 26 | import numpy as np 27 | 28 | from pytorch_pretrained_bert.modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert 29 | 30 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 31 | # Initialise PyTorch model 32 | config = BertConfig.from_json_file(bert_config_file) 33 | print("Building PyTorch model from configuration: {}".format(str(config))) 34 | model = BertForPreTraining(config) 35 | 36 | # Load weights from tf checkpoint 37 | load_tf_weights_in_bert(model, tf_checkpoint_path) 38 | 39 | # Save pytorch-model 40 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 41 | torch.save(model.state_dict(), pytorch_dump_path) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | ## Required parameters 47 | parser.add_argument("--tf_checkpoint_path", 48 | default = None, 49 | type = str, 50 | required = True, 51 | help = "Path the TensorFlow checkpoint path.") 52 | parser.add_argument("--bert_config_file", 53 | default = None, 54 | type = str, 55 | required = True, 56 | help = "The config json file corresponding to the pre-trained BERT model. \n" 57 | "This specifies the model architecture.") 58 | parser.add_argument("--pytorch_dump_path", 59 | default = None, 60 | type = str, 61 | required = True, 62 | help = "Path to the output PyTorch model.") 63 | args = parser.parse_args() 64 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, 65 | args.bert_config_file, 66 | args.pytorch_dump_path) 67 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple check list from AllenNLP repo: https://github.com/allenai/allennlp/blob/master/setup.py 3 | 4 | To create the package for pypi. 5 | 6 | 1. Change the version in __init__.py and setup.py. 7 | 8 | 2. Commit these changes with the message: "Release: VERSION" 9 | 10 | 3. Add a tag in git to mark the release: "git tag VERSION -m'Adds tag VERSION for pypi' " 11 | Push the tag to git: git push --tags origin master 12 | 13 | 4. Build both the sources and the wheel. Do not change anything in setup.py between 14 | creating the wheel and the source distribution (obviously). 15 | 16 | For the wheel, run: "python setup.py bdist_wheel" in the top level allennlp directory. 17 | (this will build a wheel for the python version you use to build it - make sure you use python 3.x). 18 | 19 | For the sources, run: "python setup.py sdist" 20 | You should now have a /dist directory with both .whl and .tar.gz source versions of allennlp. 21 | 22 | 5. Check that everything looks correct by uploading the package to the pypi test server: 23 | 24 | twine upload dist/* -r pypitest 25 | (pypi suggest using twine as other methods upload files via plaintext.) 26 | 27 | Check that you can install it in a virtualenv by running: 28 | pip install -i https://testpypi.python.org/pypi allennlp 29 | 30 | 6. Upload the final version to actual pypi: 31 | twine upload dist/* -r pypi 32 | 33 | 7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory. 
34 | 35 | """ 36 | from io import open 37 | from setuptools import find_packages, setup 38 | 39 | setup( 40 | name="pytorch_pretrained_bert", 41 | version="0.6.1", 42 | author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors", 43 | author_email="thomas@huggingface.co", 44 | description="PyTorch version of Google AI BERT model with script to load Google pre-trained models", 45 | long_description=open("README.md", "r", encoding='utf-8').read(), 46 | long_description_content_type="text/markdown", 47 | keywords='BERT NLP deep learning google', 48 | license='Apache', 49 | url="https://github.com/huggingface/pytorch-pretrained-BERT", 50 | packages=find_packages(exclude=["*.tests", "*.tests.*", 51 | "tests.*", "tests"]), 52 | install_requires=['torch>=0.4.1', 53 | 'numpy', 54 | 'boto3', 55 | 'requests', 56 | 'tqdm', 57 | 'regex'], 58 | entry_points={ 59 | 'console_scripts': [ 60 | "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main", 61 | ] 62 | }, 63 | # python_requires='>=3.5.0', 64 | tests_require=['pytest'], 65 | classifiers=[ 66 | 'Intended Audience :: Science/Research', 67 | 'License :: OSI Approved :: Apache Software License', 68 | 'Programming Language :: Python :: 3', 69 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 70 | ], 71 | ) 72 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from pytorch_pretrained_bert.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME, 25 | GPT2Config, 26 | GPT2Model, 27 | load_tf_weights_in_gpt2) 28 | 29 | 30 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 31 | # Construct model 32 | if gpt2_config_file == "": 33 | config = GPT2Config() 34 | else: 35 | config = GPT2Config(gpt2_config_file) 36 | model = GPT2Model(config) 37 | 38 | # Load weights from numpy 39 | load_tf_weights_in_gpt2(model, gpt2_checkpoint_path) 40 | 41 | # Save pytorch-model 42 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 43 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 44 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 45 | torch.save(model.state_dict(), pytorch_weights_dump_path) 46 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 47 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 48 | f.write(config.to_json_string()) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | ## Required parameters 54 | parser.add_argument("--gpt2_checkpoint_path", 55 | default = None, 56 | type = str, 57 | required = True, 58 | help = "Path the TensorFlow checkpoint path.") 59 | parser.add_argument("--pytorch_dump_folder_path", 60 | default = None, 61 | type = str, 62 | required = True, 63 | help = "Path to the output PyTorch model.") 64 | parser.add_argument("--gpt2_config_file", 65 | default = "", 66 | type = str, 67 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 68 | "This specifies the model architecture.") 69 | args = parser.parse_args() 70 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, 71 | args.gpt2_config_file, 72 | args.pytorch_dump_folder_path) 73 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from pytorch_pretrained_bert.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME, 25 | OpenAIGPTConfig, 26 | OpenAIGPTModel, 27 | load_tf_weights_in_openai_gpt) 28 | 29 | 30 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 31 | # Construct model 32 | if openai_config_file == "": 33 | config = OpenAIGPTConfig() 34 | else: 35 | config = OpenAIGPTConfig(openai_config_file) 36 | model = OpenAIGPTModel(config) 37 | 38 | # Load weights from numpy 39 | load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path) 40 | 41 | # Save pytorch-model 42 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 43 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 44 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 45 | torch.save(model.state_dict(), pytorch_weights_dump_path) 46 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 47 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 48 | f.write(config.to_json_string()) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | ## Required parameters 54 | parser.add_argument("--openai_checkpoint_folder_path", 55 | default = None, 56 | type = str, 57 | required = True, 58 | help = "Path the TensorFlow checkpoint path.") 59 | parser.add_argument("--pytorch_dump_folder_path", 60 | default = None, 61 | type = str, 62 | required = True, 63 | help = "Path to the output PyTorch model.") 64 | parser.add_argument("--openai_config_file", 65 | default = "", 66 | type = str, 67 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 68 | "This specifies the model architecture.") 69 | args = parser.parse_args() 70 | convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path, 71 | args.openai_config_file, 72 | args.pytorch_dump_folder_path) 73 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/tests/tokenization_transfo_xl_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from __future__ import absolute_import, division, print_function, unicode_literals
16 | 
17 | import os
18 | import unittest
19 | from io import open
20 | 
21 | from pytorch_pretrained_bert.tokenization_transfo_xl import (TransfoXLTokenizer,
22 |                                                              _is_control, _is_punctuation,
23 |                                                              _is_whitespace)
24 | 
25 | 
26 | class TransfoXLTokenizationTest(unittest.TestCase):
27 | 
28 |     def test_full_tokenizer(self):
29 |         vocab_tokens = [
30 |             "<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un", "running", ","
31 |         ]
32 |         with open("/tmp/transfo_xl_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer:
33 |             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
34 |             vocab_file = vocab_writer.name
35 | 
36 |         tokenizer = TransfoXLTokenizer(vocab_file=vocab_file, lower_case=True)
37 |         tokenizer.build_vocab()
38 |         os.remove(vocab_file)
39 | 
40 |         tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
41 |         self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
42 | 
43 |         self.assertListEqual(
44 |             tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
45 | 
46 |     def test_full_tokenizer_lower(self):
47 |         tokenizer = TransfoXLTokenizer(lower_case=True)
48 | 
49 |         self.assertListEqual(
50 |             tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "),
51 |             ["hello", "!", "how", "are", "you", "?"])
52 |         self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["h\u00E9llo"])
53 | 
54 |     def test_full_tokenizer_no_lower(self):
55 |         tokenizer = TransfoXLTokenizer(lower_case=False)
56 | 
57 |         self.assertListEqual(
58 |             tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "),
59 |             ["HeLLo", "!", "how", "Are", "yoU", "?"])
60 | 
61 |     def test_is_whitespace(self):
62 |         self.assertTrue(_is_whitespace(u" "))
63 |         self.assertTrue(_is_whitespace(u"\t"))
64 |         self.assertTrue(_is_whitespace(u"\r"))
65 |         self.assertTrue(_is_whitespace(u"\n"))
66 |         self.assertTrue(_is_whitespace(u"\u00A0"))
67 | 
68 |         self.assertFalse(_is_whitespace(u"A"))
69 |         self.assertFalse(_is_whitespace(u"-"))
70 | 
71 |     def test_is_control(self):
72 |         self.assertTrue(_is_control(u"\u0005"))
73 | 
74 |         self.assertFalse(_is_control(u"A"))
75 |         self.assertFalse(_is_control(u" "))
76 |         self.assertFalse(_is_control(u"\t"))
77 |         self.assertFalse(_is_control(u"\r"))
78 | 
79 |     def test_is_punctuation(self):
80 |         self.assertTrue(_is_punctuation(u"-"))
81 |         self.assertTrue(_is_punctuation(u"$"))
82 |         self.assertTrue(_is_punctuation(u"`"))
83 |         self.assertTrue(_is_punctuation(u"."))
84 | 
85 |         self.assertFalse(_is_punctuation(u"A"))
86 |         self.assertFalse(_is_punctuation(u" "))
87 | 
88 | 
89 | if __name__ == '__main__':
90 |     unittest.main()
91 | 
--------------------------------------------------------------------------------
/sentence_similarity_Bert/pytorch_pretrained_bert/__main__.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | def main():
3 |     import sys
4 |     if (len(sys.argv) != 4 and len(sys.argv) != 5) or sys.argv[1] not in [
5 |         "convert_tf_checkpoint_to_pytorch",
6 |         "convert_openai_checkpoint",
7 |         "convert_transfo_xl_checkpoint",
8 |         "convert_gpt2_checkpoint",
9 |     ]:
10 |         print(
11 |             "Should be used as one of: \n"
12 |             ">> `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n"
13 |             ">> `pytorch_pretrained_bert convert_openai_checkpoint OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`, \n"
14 |             ">> `pytorch_pretrained_bert convert_transfo_xl_checkpoint TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG]` or \n"
15 |             ">> `pytorch_pretrained_bert convert_gpt2_checkpoint TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG]`")
16 |     else:
17 |         if sys.argv[1] == "convert_tf_checkpoint_to_pytorch":
18 |             try:
19 |                 from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
20 |             except ImportError:
21 |                 print("pytorch_pretrained_bert can only be used from the command line to convert TensorFlow models to PyTorch. "
22 |                       "In that case, it requires TensorFlow to be installed. Please see "
23 |                       "https://www.tensorflow.org/install/ for installation instructions.")
24 |                 raise
25 | 
26 |             if len(sys.argv) != 5:
27 |                 # pylint: disable=line-too-long
28 |                 print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
29 |             else:
30 |                 PYTORCH_DUMP_OUTPUT = sys.argv.pop()
31 |                 TF_CONFIG = sys.argv.pop()
32 |                 TF_CHECKPOINT = sys.argv.pop()
33 |                 convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
34 |         elif sys.argv[1] == "convert_openai_checkpoint":
35 |             from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
36 |             OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
37 |             PYTORCH_DUMP_OUTPUT = sys.argv[3]
38 |             if len(sys.argv) == 5:
39 |                 OPENAI_GPT_CONFIG = sys.argv[4]
40 |             else:
41 |                 OPENAI_GPT_CONFIG = ""
42 |             convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH,
43 |                                                  OPENAI_GPT_CONFIG,
44 |                                                  PYTORCH_DUMP_OUTPUT)
45 |         elif sys.argv[1] == "convert_transfo_xl_checkpoint":
46 |             try:
47 |                 from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
48 |             except ImportError:
49 |                 print("pytorch_pretrained_bert can only be used from the command line to convert TensorFlow models to PyTorch. "
50 |                       "In that case, it requires TensorFlow to be installed. Please see "
51 |                       "https://www.tensorflow.org/install/ for installation instructions.")
52 |                 raise
53 | 
54 |             if 'ckpt' in sys.argv[2].lower():
55 |                 TF_CHECKPOINT = sys.argv[2]
56 |                 TF_DATASET_FILE = ""
57 |             else:
58 |                 TF_DATASET_FILE = sys.argv[2]
59 |                 TF_CHECKPOINT = ""
60 |             PYTORCH_DUMP_OUTPUT = sys.argv[3]
61 |             if len(sys.argv) == 5:
62 |                 TF_CONFIG = sys.argv[4]
63 |             else:
64 |                 TF_CONFIG = ""
65 |             convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE)
66 |         else:
67 |             try:
68 |                 from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
69 |             except ImportError:
70 |                 print("pytorch_pretrained_bert can only be used from the command line to convert TensorFlow models to PyTorch. "
71 |                       "In that case, it requires TensorFlow to be installed. Please see "
72 |                       "https://www.tensorflow.org/install/ for installation instructions.")
73 |                 raise
74 | 
75 |             TF_CHECKPOINT = sys.argv[2]
76 |             PYTORCH_DUMP_OUTPUT = sys.argv[3]
77 |             if len(sys.argv) == 5:
78 |                 TF_CONFIG = sys.argv[4]
79 |             else:
80 |                 TF_CONFIG = ""
81 |             convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
82 | if __name__ == '__main__':
83 |     main()
84 | 
--------------------------------------------------------------------------------
/sentence_similarity_Bert/tests/tokenization_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | from io import open 20 | 21 | from pytorch_pretrained_bert.tokenization import (BasicTokenizer, 22 | BertTokenizer, 23 | WordpieceTokenizer, 24 | _is_control, _is_punctuation, 25 | _is_whitespace) 26 | 27 | 28 | class TokenizationTest(unittest.TestCase): 29 | 30 | def test_full_tokenizer(self): 31 | vocab_tokens = [ 32 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 33 | "##ing", "," 34 | ] 35 | with open("/tmp/bert_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer: 36 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) 37 | 38 | vocab_file = vocab_writer.name 39 | 40 | tokenizer = BertTokenizer(vocab_file) 41 | os.remove(vocab_file) 42 | 43 | tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") 44 | self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) 45 | 46 | self.assertListEqual( 47 | tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) 48 | 49 | def test_full_tokenizer_raises_error_for_long_sequences(self): 50 | vocab_tokens = [ 51 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 52 | "##ing", "," 53 | ] 54 | with open("/tmp/bert_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer: 55 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) 56 | vocab_file = vocab_writer.name 57 | 58 | tokenizer = BertTokenizer(vocab_file, max_len=10) 59 | os.remove(vocab_file) 60 | tokens = tokenizer.tokenize(u"the cat sat on the mat in the summer time") 61 | indices = tokenizer.convert_tokens_to_ids(tokens) 62 | self.assertListEqual(indices, [0 for _ in range(10)]) 63 | 64 | tokens = tokenizer.tokenize(u"the cat sat on the mat in the summer time .") 65 | self.assertRaises(ValueError, tokenizer.convert_tokens_to_ids, tokens) 66 | 67 | def test_chinese(self): 68 | tokenizer = BasicTokenizer() 69 | 70 | self.assertListEqual( 71 | tokenizer.tokenize(u"ah\u535A\u63A8zz"), 72 | [u"ah", u"\u535A", u"\u63A8", u"zz"]) 73 | 74 | def test_basic_tokenizer_lower(self): 75 | tokenizer = BasicTokenizer(do_lower_case=True) 76 | 77 | self.assertListEqual( 78 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), 79 | ["hello", "!", "how", "are", "you", "?"]) 80 | self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) 81 | 82 | def test_basic_tokenizer_no_lower(self): 83 | tokenizer = BasicTokenizer(do_lower_case=False) 84 | 85 | self.assertListEqual( 86 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? 
"), 87 | ["HeLLo", "!", "how", "Are", "yoU", "?"]) 88 | 89 | def test_wordpiece_tokenizer(self): 90 | vocab_tokens = [ 91 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 92 | "##ing" 93 | ] 94 | 95 | vocab = {} 96 | for (i, token) in enumerate(vocab_tokens): 97 | vocab[token] = i 98 | tokenizer = WordpieceTokenizer(vocab=vocab) 99 | 100 | self.assertListEqual(tokenizer.tokenize(""), []) 101 | 102 | self.assertListEqual( 103 | tokenizer.tokenize("unwanted running"), 104 | ["un", "##want", "##ed", "runn", "##ing"]) 105 | 106 | self.assertListEqual( 107 | tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) 108 | 109 | def test_is_whitespace(self): 110 | self.assertTrue(_is_whitespace(u" ")) 111 | self.assertTrue(_is_whitespace(u"\t")) 112 | self.assertTrue(_is_whitespace(u"\r")) 113 | self.assertTrue(_is_whitespace(u"\n")) 114 | self.assertTrue(_is_whitespace(u"\u00A0")) 115 | 116 | self.assertFalse(_is_whitespace(u"A")) 117 | self.assertFalse(_is_whitespace(u"-")) 118 | 119 | def test_is_control(self): 120 | self.assertTrue(_is_control(u"\u0005")) 121 | 122 | self.assertFalse(_is_control(u"A")) 123 | self.assertFalse(_is_control(u" ")) 124 | self.assertFalse(_is_control(u"\t")) 125 | self.assertFalse(_is_control(u"\r")) 126 | 127 | def test_is_punctuation(self): 128 | self.assertTrue(_is_punctuation(u"-")) 129 | self.assertTrue(_is_punctuation(u"$")) 130 | self.assertTrue(_is_punctuation(u"`")) 131 | self.assertTrue(_is_punctuation(u".")) 132 | 133 | self.assertFalse(_is_punctuation(u"A")) 134 | self.assertFalse(_is_punctuation(u" ")) 135 | 136 | 137 | if __name__ == '__main__': 138 | unittest.main() 139 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert Transformer XL checkpoint and datasets.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import os 21 | import sys 22 | from io import open 23 | 24 | import torch 25 | 26 | import pytorch_pretrained_bert.tokenization_transfo_xl as data_utils 27 | from pytorch_pretrained_bert.modeling_transfo_xl import (CONFIG_NAME, 28 | WEIGHTS_NAME, 29 | TransfoXLConfig, 30 | TransfoXLLMHeadModel, 31 | load_tf_weights_in_transfo_xl) 32 | from pytorch_pretrained_bert.tokenization_transfo_xl import (CORPUS_NAME, 33 | VOCAB_NAME) 34 | 35 | if sys.version_info[0] == 2: 36 | import cPickle as pickle 37 | else: 38 | import pickle 39 | 40 | # We do this to be able to load python 2 datasets pickles 41 | # See e.g. 
https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918
42 | data_utils.Vocab = data_utils.TransfoXLTokenizer
43 | data_utils.Corpus = data_utils.TransfoXLCorpus
44 | sys.modules['data_utils'] = data_utils
45 | sys.modules['vocabulary'] = data_utils
46 | 
47 | def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
48 |                                              transfo_xl_config_file,
49 |                                              pytorch_dump_folder_path,
50 |                                              transfo_xl_dataset_file):
51 |     if transfo_xl_dataset_file:
52 |         # Convert a pre-processed corpus (see original TensorFlow repo)
53 |         with open(transfo_xl_dataset_file, "rb") as fp:
54 |             corpus = pickle.load(fp, encoding="latin1")
55 |         # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term)
56 |         pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_NAME
57 |         print("Save vocabulary to {}".format(pytorch_vocab_dump_path))
58 |         corpus_vocab_dict = corpus.vocab.__dict__
59 |         torch.save(corpus_vocab_dict, pytorch_vocab_dump_path)
60 | 
61 |         corpus_dict_no_vocab = corpus.__dict__
62 |         corpus_dict_no_vocab.pop('vocab', None)
63 |         pytorch_dataset_dump_path = pytorch_dump_folder_path + '/' + CORPUS_NAME
64 |         print("Save dataset to {}".format(pytorch_dataset_dump_path))
65 |         torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path)
66 | 
67 |     if tf_checkpoint_path:
68 |         # Convert a pre-trained TensorFlow model
69 |         config_path = os.path.abspath(transfo_xl_config_file)
70 |         tf_path = os.path.abspath(tf_checkpoint_path)
71 | 
72 |         print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path))
73 |         # Initialise PyTorch model
74 |         if transfo_xl_config_file == "":
75 |             config = TransfoXLConfig()
76 |         else:
77 |             config = TransfoXLConfig(transfo_xl_config_file)
78 |         print("Building PyTorch model from configuration: {}".format(str(config)))
79 |         model = TransfoXLLMHeadModel(config)
80 | 
81 |         model = load_tf_weights_in_transfo_xl(model, config, tf_path)
82 |         # Save pytorch-model
83 |         pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
84 |         pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
85 |         print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path)))
86 |         torch.save(model.state_dict(), pytorch_weights_dump_path)
87 |         print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path)))
88 |         with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
89 |             f.write(config.to_json_string())
90 | 
91 | 
92 | if __name__ == "__main__":
93 |     parser = argparse.ArgumentParser()
94 |     parser.add_argument("--pytorch_dump_folder_path",
95 |                         default = None,
96 |                         type = str,
97 |                         required = True,
98 |                         help = "Path to the folder to store the PyTorch model or dataset/vocab.")
99 |     parser.add_argument("--tf_checkpoint_path",
100 |                         default = "",
101 |                         type = str,
102 |                         help = "An optional path to a TensorFlow checkpoint to be converted.")
103 |     parser.add_argument("--transfo_xl_config_file",
104 |                         default = "",
105 |                         type = str,
106 |                         help = "An optional config json file corresponding to the pre-trained Transformer-XL model. 
\n" 107 | "This specifies the model architecture.") 108 | parser.add_argument("--transfo_xl_dataset_file", 109 | default = "", 110 | type = str, 111 | help = "An optional dataset file to be converted in a vocabulary.") 112 | args = parser.parse_args() 113 | convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path, 114 | args.transfo_xl_config_file, 115 | args.pytorch_dump_folder_path, 116 | args.transfo_xl_dataset_file) 117 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/pytorch_pretrained_bert/optimization_openai.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """PyTorch optimization for OpenAI GPT model.""" 16 | 17 | import math 18 | import torch 19 | from torch.optim import Optimizer 20 | from torch.optim.optimizer import required 21 | from torch.nn.utils import clip_grad_norm_ 22 | 23 | def warmup_cosine(x, warmup=0.002): 24 | s = 1 if x <= warmup else 0 25 | return s*(x/warmup) + (1-s)*(0.5 * (1 + torch.cos(math.pi * x))) 26 | 27 | def warmup_constant(x, warmup=0.002): 28 | s = 1 if x <= warmup else 0 29 | return s*(x/warmup) + (1-s)*1 30 | 31 | def warmup_linear(x, warmup=0.002): 32 | s = 1 if x <= warmup else 0 33 | return (s*(x/warmup) + (1-s))*(1-x) 34 | 35 | SCHEDULES = { 36 | 'warmup_cosine':warmup_cosine, 37 | 'warmup_constant':warmup_constant, 38 | 'warmup_linear':warmup_linear, 39 | } 40 | 41 | 42 | class OpenAIAdam(Optimizer): 43 | """Implements Open AI version of Adam algorithm with weight decay fix. 
44 | """ 45 | def __init__(self, params, lr=required, schedule='warmup_linear', warmup=-1, t_total=-1, 46 | b1=0.9, b2=0.999, e=1e-8, weight_decay=0, 47 | vector_l2=False, max_grad_norm=-1, **kwargs): 48 | if lr is not required and lr < 0.0: 49 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 50 | if schedule not in SCHEDULES: 51 | raise ValueError("Invalid schedule parameter: {}".format(schedule)) 52 | if not 0.0 <= warmup < 1.0 and not warmup == -1: 53 | raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) 54 | if not 0.0 <= b1 < 1.0: 55 | raise ValueError("Invalid b1 parameter: {}".format(b1)) 56 | if not 0.0 <= b2 < 1.0: 57 | raise ValueError("Invalid b2 parameter: {}".format(b2)) 58 | if not e >= 0.0: 59 | raise ValueError("Invalid epsilon value: {}".format(e)) 60 | defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, 61 | b1=b1, b2=b2, e=e, weight_decay=weight_decay, vector_l2=vector_l2, 62 | max_grad_norm=max_grad_norm) 63 | super(OpenAIAdam, self).__init__(params, defaults) 64 | 65 | def get_lr(self): 66 | lr = [] 67 | for group in self.param_groups: 68 | for p in group['params']: 69 | state = self.state[p] 70 | if len(state) == 0: 71 | return [0] 72 | if group['t_total'] != -1: 73 | schedule_fct = SCHEDULES[group['schedule']] 74 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) 75 | else: 76 | lr_scheduled = group['lr'] 77 | lr.append(lr_scheduled) 78 | return lr 79 | 80 | def step(self, closure=None): 81 | """Performs a single optimization step. 82 | 83 | Arguments: 84 | closure (callable, optional): A closure that reevaluates the model 85 | and returns the loss. 86 | """ 87 | loss = None 88 | if closure is not None: 89 | loss = closure() 90 | 91 | for group in self.param_groups: 92 | for p in group['params']: 93 | if p.grad is None: 94 | continue 95 | grad = p.grad.data 96 | if grad.is_sparse: 97 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 98 | 99 | state = self.state[p] 100 | 101 | # State initialization 102 | if len(state) == 0: 103 | state['step'] = 0 104 | # Exponential moving average of gradient values 105 | state['exp_avg'] = torch.zeros_like(p.data) 106 | # Exponential moving average of squared gradient values 107 | state['exp_avg_sq'] = torch.zeros_like(p.data) 108 | 109 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 110 | beta1, beta2 = group['b1'], group['b2'] 111 | 112 | state['step'] += 1 113 | 114 | # Add grad clipping 115 | if group['max_grad_norm'] > 0: 116 | clip_grad_norm_(p, group['max_grad_norm']) 117 | 118 | # Decay the first and second moment running average coefficient 119 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 120 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 121 | denom = exp_avg_sq.sqrt().add_(group['e']) 122 | 123 | bias_correction1 = 1 - beta1 ** state['step'] 124 | bias_correction2 = 1 - beta2 ** state['step'] 125 | 126 | if group['t_total'] != -1: 127 | schedule_fct = SCHEDULES[group['schedule']] 128 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) 129 | else: 130 | lr_scheduled = group['lr'] 131 | 132 | step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 133 | 134 | p.data.addcdiv_(-step_size, exp_avg, denom) 135 | 136 | # Add weight decay at the end (fixed version) 137 | if (len(p.size()) > 1 or group['vector_l2']) and group['weight_decay'] > 0: 138 | 
p.data.add_(-lr_scheduled * group['weight_decay'], p.data)
139 | 
140 |         return loss
141 | 
--------------------------------------------------------------------------------
/sentence_similarity_Bert/pytorch_pretrained_bert/optimization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """PyTorch optimization for BERT model."""
16 | 
17 | import math
18 | import torch
19 | from torch.optim import Optimizer
20 | from torch.optim.optimizer import required
21 | from torch.nn.utils import clip_grad_norm_
22 | 
23 | def warmup_cosine(x, warmup=0.002):
24 |     if x < warmup:
25 |         return x/warmup
26 |     return 0.5 * (1.0 + math.cos(math.pi * x))  # math.cos: x is a plain float here, torch.cos would raise a TypeError
27 | 
28 | def warmup_constant(x, warmup=0.002):
29 |     if x < warmup:
30 |         return x/warmup
31 |     return 1.0
32 | 
33 | def warmup_linear(x, warmup=0.002):
34 |     if x < warmup:
35 |         return x/warmup
36 |     return 1.0 - x
37 | 
38 | SCHEDULES = {
39 |     'warmup_cosine':warmup_cosine,
40 |     'warmup_constant':warmup_constant,
41 |     'warmup_linear':warmup_linear,
42 | }
43 | 
44 | 
45 | class BertAdam(Optimizer):
46 |     """Implements BERT version of Adam algorithm with weight decay fix.
47 |     Params:
48 |         lr: learning rate
49 |         warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
50 |         t_total: total number of training steps for the learning
51 |             rate schedule, -1 means constant learning rate. Default: -1
52 |         schedule: schedule to use for the warmup (see above). Default: 'warmup_linear'
53 |         b1: Adams b1. Default: 0.9
54 |         b2: Adams b2. Default: 0.999
55 |         e: Adams epsilon. Default: 1e-6
56 |         weight_decay: Weight decay. Default: 0.01
57 |         max_grad_norm: Maximum norm for the gradients (-1 means no clipping). 
Default: 1.0 58 | """ 59 | def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', 60 | b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, 61 | max_grad_norm=1.0): 62 | if lr is not required and lr < 0.0: 63 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 64 | if schedule not in SCHEDULES: 65 | raise ValueError("Invalid schedule parameter: {}".format(schedule)) 66 | if not 0.0 <= warmup < 1.0 and not warmup == -1: 67 | raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) 68 | if not 0.0 <= b1 < 1.0: 69 | raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) 70 | if not 0.0 <= b2 < 1.0: 71 | raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) 72 | if not e >= 0.0: 73 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) 74 | defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, 75 | b1=b1, b2=b2, e=e, weight_decay=weight_decay, 76 | max_grad_norm=max_grad_norm) 77 | super(BertAdam, self).__init__(params, defaults) 78 | 79 | def get_lr(self): 80 | lr = [] 81 | for group in self.param_groups: 82 | for p in group['params']: 83 | state = self.state[p] 84 | if len(state) == 0: 85 | return [0] 86 | if group['t_total'] != -1: 87 | schedule_fct = SCHEDULES[group['schedule']] 88 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) 89 | else: 90 | lr_scheduled = group['lr'] 91 | lr.append(lr_scheduled) 92 | return lr 93 | 94 | def step(self, closure=None): 95 | """Performs a single optimization step. 96 | 97 | Arguments: 98 | closure (callable, optional): A closure that reevaluates the model 99 | and returns the loss. 100 | """ 101 | loss = None 102 | if closure is not None: 103 | loss = closure() 104 | 105 | for group in self.param_groups: 106 | for p in group['params']: 107 | if p.grad is None: 108 | continue 109 | grad = p.grad.data 110 | if grad.is_sparse: 111 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 112 | 113 | state = self.state[p] 114 | 115 | # State initialization 116 | if len(state) == 0: 117 | state['step'] = 0 118 | # Exponential moving average of gradient values 119 | state['next_m'] = torch.zeros_like(p.data) 120 | # Exponential moving average of squared gradient values 121 | state['next_v'] = torch.zeros_like(p.data) 122 | 123 | next_m, next_v = state['next_m'], state['next_v'] 124 | beta1, beta2 = group['b1'], group['b2'] 125 | 126 | # Add grad clipping 127 | if group['max_grad_norm'] > 0: 128 | clip_grad_norm_(p, group['max_grad_norm']) 129 | 130 | # Decay the first and second moment running average coefficient 131 | # In-place operations to update the averages at the same time 132 | next_m.mul_(beta1).add_(1 - beta1, grad) 133 | next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) 134 | update = next_m / (next_v.sqrt() + group['e']) 135 | 136 | # Just adding the square of the weights to the loss function is *not* 137 | # the correct way of using L2 regularization/weight decay with Adam, 138 | # since that will interact with the m and v parameters in strange ways. 139 | # 140 | # Instead we want to decay the weights in a manner that doesn't interact 141 | # with the m/v parameters. This is equivalent to adding the square 142 | # of the weights to the loss with plain (non-momentum) SGD. 
143 | if group['weight_decay'] > 0.0: 144 | update += group['weight_decay'] * p.data 145 | 146 | if group['t_total'] != -1: 147 | schedule_fct = SCHEDULES[group['schedule']] 148 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) 149 | else: 150 | lr_scheduled = group['lr'] 151 | 152 | update_with_lr = lr_scheduled * update 153 | p.data.add_(-update_with_lr) 154 | 155 | state['step'] += 1 156 | 157 | # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 158 | # No bias correction 159 | # bias_correction1 = 1 - beta1 ** state['step'] 160 | # bias_correction2 = 1 - beta2 ** state['step'] 161 | 162 | return loss 163 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/tests/modeling_gpt2_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import json 21 | import random 22 | 23 | import torch 24 | 25 | from pytorch_pretrained_bert import (GPT2Config, GPT2Model, 26 | GPT2LMHeadModel, GPT2DoubleHeadsModel) 27 | 28 | 29 | class GPT2ModelTest(unittest.TestCase): 30 | class GPT2ModelTester(object): 31 | 32 | def __init__(self, 33 | parent, 34 | batch_size=13, 35 | seq_length=7, 36 | is_training=True, 37 | use_position_ids=True, 38 | use_token_type_ids=True, 39 | use_labels=True, 40 | vocab_size=99, 41 | n_positions=33, 42 | n_embd=32, 43 | n_layer=5, 44 | n_head=4, 45 | n_choices=3, 46 | type_sequence_label_size=2, 47 | initializer_range=0.02, 48 | num_labels=3, 49 | scope=None): 50 | self.parent = parent 51 | self.batch_size = batch_size 52 | self.seq_length = seq_length 53 | self.is_training = is_training 54 | self.use_position_ids = use_position_ids 55 | self.use_token_type_ids = use_token_type_ids 56 | self.use_labels = use_labels 57 | self.vocab_size = vocab_size 58 | self.n_positions = n_positions 59 | self.n_embd = n_embd 60 | self.n_layer = n_layer 61 | self.n_head = n_head 62 | self.n_choices = n_choices 63 | self.type_sequence_label_size = type_sequence_label_size 64 | self.initializer_range = initializer_range 65 | self.num_labels = num_labels 66 | self.scope = scope 67 | 68 | def prepare_config_and_inputs(self): 69 | input_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.vocab_size) 70 | 71 | position_ids = None 72 | if self.use_position_ids: 73 | position_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions) 74 | 75 | token_type_ids = None 76 | if self.use_token_type_ids: 77 | total_voc = self.vocab_size 78 | token_type_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc) 79 | 80 | mc_labels = 
None 81 | lm_labels = None 82 | mc_token_ids = None 83 | if self.use_labels: 84 | mc_labels = GPT2ModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size) 85 | lm_labels = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels) 86 | mc_token_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices], self.seq_length) 87 | 88 | config = GPT2Config( 89 | vocab_size_or_config_json_file=self.vocab_size, 90 | n_positions=self.n_positions, 91 | n_embd=self.n_embd, 92 | n_layer=self.n_layer, 93 | n_head=self.n_head, 94 | initializer_range=self.initializer_range) 95 | 96 | return (config, input_ids, token_type_ids, position_ids, 97 | mc_labels, lm_labels, mc_token_ids) 98 | 99 | def create_gpt2_model(self, config, input_ids, token_type_ids, position_ids, 100 | mc_labels, lm_labels, mc_token_ids): 101 | model = GPT2Model(config) 102 | model.eval() 103 | hidden_states, presents = model(input_ids, position_ids, token_type_ids) 104 | outputs = { 105 | "hidden_states": hidden_states, 106 | "presents": presents, 107 | } 108 | return outputs 109 | 110 | def check_gpt2_model_output(self, result): 111 | self.parent.assertListEqual( 112 | list(result["hidden_states"].size()), 113 | [self.batch_size, self.n_choices, self.seq_length, self.n_embd]) 114 | 115 | 116 | def create_gpt2_lm_head(self, config, input_ids, token_type_ids, position_ids, 117 | mc_labels, lm_labels, mc_token_ids): 118 | model = GPT2LMHeadModel(config) 119 | model.eval() 120 | loss = model(input_ids, position_ids, token_type_ids, lm_labels) 121 | lm_logits, presents = model(input_ids, position_ids, token_type_ids) 122 | outputs = { 123 | "loss": loss, 124 | "lm_logits": lm_logits, 125 | "presents": presents, 126 | } 127 | return outputs 128 | 129 | def check_gpt2_lm_head_output(self, result): 130 | total_voc = self.vocab_size 131 | self.parent.assertListEqual( 132 | list(result["lm_logits"].size()), 133 | [self.batch_size, self.n_choices, self.seq_length, total_voc]) 134 | 135 | def check_gpt2_lm_head_loss_output(self, result): 136 | self.parent.assertListEqual( 137 | list(result["loss"].size()), 138 | []) 139 | 140 | def create_gpt2_double_heads(self, config, input_ids, token_type_ids, position_ids, 141 | mc_labels, lm_labels, mc_token_ids): 142 | model = GPT2DoubleHeadsModel(config) 143 | model.eval() 144 | loss = model(input_ids, mc_token_ids, 145 | lm_labels=lm_labels, mc_labels=mc_labels, 146 | token_type_ids=token_type_ids, position_ids=position_ids) 147 | lm_logits, mc_logits, presents = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids) 148 | outputs = { 149 | "loss": loss, 150 | "lm_logits": lm_logits, 151 | "mc_logits": mc_logits, 152 | "presents": presents, 153 | } 154 | return outputs 155 | 156 | def check_gpt2_double_heads_output(self, result): 157 | total_voc = self.vocab_size 158 | self.parent.assertListEqual( 159 | list(result["lm_logits"].size()), 160 | [self.batch_size, self.n_choices, self.seq_length, total_voc]) 161 | self.parent.assertListEqual( 162 | list(result["mc_logits"].size()), 163 | [self.batch_size, self.n_choices]) 164 | 165 | def check_gpt2_double_heads_loss_output(self, result): 166 | self.parent.assertListEqual( 167 | [list(l.size()) for l in result["loss"]], 168 | [[], []]) 169 | 170 | def test_default(self): 171 | self.run_tester(GPT2ModelTest.GPT2ModelTester(self)) 172 | 173 | def test_config_to_json_string(self): 174 | config = GPT2Config(vocab_size_or_config_json_file=99, n_embd=37) 175 | obj = 
json.loads(config.to_json_string()) 176 | self.assertEqual(obj["vocab_size"], 99) 177 | self.assertEqual(obj["n_embd"], 37) 178 | 179 | def run_tester(self, tester): 180 | config_and_inputs = tester.prepare_config_and_inputs() 181 | output_result = tester.create_gpt2_model(*config_and_inputs) 182 | tester.check_gpt2_model_output(output_result) 183 | 184 | output_result = tester.create_gpt2_lm_head(*config_and_inputs) 185 | tester.check_gpt2_lm_head_output(output_result) 186 | tester.check_gpt2_lm_head_loss_output(output_result) 187 | 188 | output_result = tester.create_gpt2_double_heads(*config_and_inputs) 189 | tester.check_gpt2_double_heads_output(output_result) 190 | tester.check_gpt2_double_heads_loss_output(output_result) 191 | 192 | @classmethod 193 | def ids_tensor(cls, shape, vocab_size, rng=None, name=None): 194 | """Creates a random int32 tensor of the shape within the vocab size.""" 195 | if rng is None: 196 | rng = random.Random() 197 | 198 | total_dims = 1 199 | for dim in shape: 200 | total_dims *= dim 201 | 202 | values = [] 203 | for _ in range(total_dims): 204 | values.append(rng.randint(0, vocab_size - 1)) 205 | 206 | return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous() 207 | 208 | 209 | if __name__ == "__main__": 210 | unittest.main() 211 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/pytorch_pretrained_bert/file_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for working with the local dataset cache. 3 | This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp 4 | Copyright by the AllenNLP authors. 5 | """ 6 | from __future__ import (absolute_import, division, print_function, unicode_literals) 7 | 8 | import json 9 | import logging 10 | import os 11 | import shutil 12 | import tempfile 13 | from functools import wraps 14 | from hashlib import sha256 15 | import sys 16 | from io import open 17 | 18 | import boto3 19 | import requests 20 | from botocore.exceptions import ClientError 21 | from tqdm import tqdm 22 | 23 | try: 24 | from urllib.parse import urlparse 25 | except ImportError: 26 | from urlparse import urlparse 27 | 28 | try: 29 | from pathlib import Path 30 | PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 31 | Path.home() / '.pytorch_pretrained_bert')) 32 | except AttributeError: 33 | PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 34 | os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert')) 35 | 36 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 37 | 38 | 39 | def url_to_filename(url, etag=None): 40 | """ 41 | Convert `url` into a hashed filename in a repeatable way. 42 | If `etag` is specified, append its hash to the url's, delimited 43 | by a period. 44 | """ 45 | url_bytes = url.encode('utf-8') 46 | url_hash = sha256(url_bytes) 47 | filename = url_hash.hexdigest() 48 | 49 | if etag: 50 | etag_bytes = etag.encode('utf-8') 51 | etag_hash = sha256(etag_bytes) 52 | filename += '.' + etag_hash.hexdigest() 53 | 54 | return filename 55 | 56 | 57 | def filename_to_url(filename, cache_dir=None): 58 | """ 59 | Return the url and etag (which may be ``None``) stored for `filename`. 60 | Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. 
61 | """ 62 | if cache_dir is None: 63 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 64 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 65 | cache_dir = str(cache_dir) 66 | 67 | cache_path = os.path.join(cache_dir, filename) 68 | if not os.path.exists(cache_path): 69 | raise EnvironmentError("file {} not found".format(cache_path)) 70 | 71 | meta_path = cache_path + '.json' 72 | if not os.path.exists(meta_path): 73 | raise EnvironmentError("file {} not found".format(meta_path)) 74 | 75 | with open(meta_path, encoding="utf-8") as meta_file: 76 | metadata = json.load(meta_file) 77 | url = metadata['url'] 78 | etag = metadata['etag'] 79 | 80 | return url, etag 81 | 82 | 83 | def cached_path(url_or_filename, cache_dir=None): 84 | """ 85 | Given something that might be a URL (or might be a local path), 86 | determine which. If it's a URL, download the file and cache it, and 87 | return the path to the cached file. If it's already a local path, 88 | make sure the file exists and then return the path. 89 | """ 90 | if cache_dir is None: 91 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 92 | if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): 93 | url_or_filename = str(url_or_filename) 94 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 95 | cache_dir = str(cache_dir) 96 | 97 | parsed = urlparse(url_or_filename) 98 | 99 | if parsed.scheme in ('http', 'https', 's3'): 100 | # URL, so get it from the cache (downloading if necessary) 101 | return get_from_cache(url_or_filename, cache_dir) 102 | elif os.path.exists(url_or_filename): 103 | # File, and it exists. 104 | return url_or_filename 105 | elif parsed.scheme == '': 106 | # File, but it doesn't exist. 107 | raise EnvironmentError("file {} not found".format(url_or_filename)) 108 | else: 109 | # Something unknown 110 | raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) 111 | 112 | 113 | def split_s3_path(url): 114 | """Split a full s3 path into the bucket name and path.""" 115 | parsed = urlparse(url) 116 | if not parsed.netloc or not parsed.path: 117 | raise ValueError("bad s3 path {}".format(url)) 118 | bucket_name = parsed.netloc 119 | s3_path = parsed.path 120 | # Remove '/' at beginning of path. 121 | if s3_path.startswith("/"): 122 | s3_path = s3_path[1:] 123 | return bucket_name, s3_path 124 | 125 | 126 | def s3_request(func): 127 | """ 128 | Wrapper function for s3 requests in order to create more helpful error 129 | messages. 
130 | """ 131 | 132 | @wraps(func) 133 | def wrapper(url, *args, **kwargs): 134 | try: 135 | return func(url, *args, **kwargs) 136 | except ClientError as exc: 137 | if int(exc.response["Error"]["Code"]) == 404: 138 | raise EnvironmentError("file {} not found".format(url)) 139 | else: 140 | raise 141 | 142 | return wrapper 143 | 144 | 145 | @s3_request 146 | def s3_etag(url): 147 | """Check ETag on S3 object.""" 148 | s3_resource = boto3.resource("s3") 149 | bucket_name, s3_path = split_s3_path(url) 150 | s3_object = s3_resource.Object(bucket_name, s3_path) 151 | return s3_object.e_tag 152 | 153 | 154 | @s3_request 155 | def s3_get(url, temp_file): 156 | """Pull a file directly from S3.""" 157 | s3_resource = boto3.resource("s3") 158 | bucket_name, s3_path = split_s3_path(url) 159 | s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) 160 | 161 | 162 | def http_get(url, temp_file): 163 | req = requests.get(url, stream=True) 164 | content_length = req.headers.get('Content-Length') 165 | total = int(content_length) if content_length is not None else None 166 | progress = tqdm(unit="B", total=total) 167 | for chunk in req.iter_content(chunk_size=1024): 168 | if chunk: # filter out keep-alive new chunks 169 | progress.update(len(chunk)) 170 | temp_file.write(chunk) 171 | progress.close() 172 | 173 | 174 | def get_from_cache(url, cache_dir=None): 175 | """ 176 | Given a URL, look for the corresponding dataset in the local cache. 177 | If it's not there, download it. Then return the path to the cached file. 178 | """ 179 | if cache_dir is None: 180 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 181 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 182 | cache_dir = str(cache_dir) 183 | 184 | if not os.path.exists(cache_dir): 185 | os.makedirs(cache_dir) 186 | 187 | # Get eTag to add to filename, if it exists. 188 | if url.startswith("s3://"): 189 | etag = s3_etag(url) 190 | else: 191 | response = requests.head(url, allow_redirects=True) 192 | if response.status_code != 200: 193 | raise IOError("HEAD request failed for url {} with status code {}" 194 | .format(url, response.status_code)) 195 | etag = response.headers.get("ETag") 196 | 197 | filename = url_to_filename(url, etag) 198 | 199 | # get cache path to put the file 200 | cache_path = os.path.join(cache_dir, filename) 201 | 202 | if not os.path.exists(cache_path): 203 | # Download to temporary file, then copy to cache dir once finished. 204 | # Otherwise you get corrupt cache entries if the download gets interrupted. 
205 | with tempfile.NamedTemporaryFile() as temp_file: 206 | logger.info("%s not found in cache, downloading to %s", url, temp_file.name) 207 | 208 | # GET file object 209 | if url.startswith("s3://"): 210 | s3_get(url, temp_file) 211 | else: 212 | http_get(url, temp_file) 213 | 214 | # we are copying the file before closing it, so flush to avoid truncation 215 | temp_file.flush() 216 | # shutil.copyfileobj() starts at the current position, so go to the start 217 | temp_file.seek(0) 218 | 219 | logger.info("copying %s to cache at %s", temp_file.name, cache_path) 220 | with open(cache_path, 'wb') as cache_file: 221 | shutil.copyfileobj(temp_file, cache_file) 222 | 223 | logger.info("creating metadata file for %s", cache_path) 224 | meta = {'url': url, 'etag': etag} 225 | meta_path = cache_path + '.json' 226 | with open(meta_path, 'w', encoding="utf-8") as meta_file: 227 | json.dump(meta, meta_file) 228 | 229 | logger.info("removing temp file %s", temp_file.name) 230 | 231 | return cache_path 232 | 233 | 234 | def read_set_from_file(filename): 235 | ''' 236 | Extract a de-duped collection (set) of text from a file. 237 | Expected file format is one item per line. 238 | ''' 239 | collection = set() 240 | with open(filename, 'r', encoding='utf-8') as file_: 241 | for line in file_: 242 | collection.add(line.rstrip()) 243 | return collection 244 | 245 | 246 | def get_file_extension(path, dot=True, lower=True): 247 | ext = os.path.splitext(path)[1] 248 | ext = ext if dot else ext[1:] 249 | return ext.lower() if lower else ext 250 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/pytorch_pretrained_bert/tokenization_gpt2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for OpenAI GPT.""" 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import json 20 | import logging 21 | import os 22 | import regex as re 23 | from io import open 24 | 25 | try: 26 | from functools import lru_cache 27 | except ImportError: 28 | # Just a dummy decorator to get the checks to run on python2 29 | # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 
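# (The fallback below is a no-op decorator factory: @lru_cache() just returns the
# wrapped function unchanged, so on Python 2 bytes_to_unicode() is recomputed on every call.)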
30 | def lru_cache(): 31 | return lambda func: func 32 | 33 | from .file_utils import cached_path 34 | 35 | logger = logging.getLogger(__name__) 36 | 37 | PRETRAINED_VOCAB_ARCHIVE_MAP = { 38 | 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", 39 | } 40 | PRETRAINED_MERGES_ARCHIVE_MAP = { 41 | 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", 42 | } 43 | PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { 44 | 'gpt2': 1024, 45 | } 46 | VOCAB_NAME = 'vocab.json' 47 | MERGES_NAME = 'merges.txt' 48 | 49 | @lru_cache() 50 | def bytes_to_unicode(): 51 | """ 52 | Returns a dict mapping utf-8 bytes to unicode strings. 53 | The reversible bpe codes work on unicode strings. 54 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 55 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 56 | This is a significant percentage of your normal, say, 32K bpe vocab. 57 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 58 | It also avoids mapping to whitespace/control characters that the bpe code barfs on. 59 | """ 60 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 61 | cs = bs[:] 62 | n = 0 63 | for b in range(2**8): 64 | if b not in bs: 65 | bs.append(b) 66 | cs.append(2**8+n) 67 | n += 1 68 | cs = [chr(n) for n in cs] 69 | return dict(zip(bs, cs)) 70 | 71 | def get_pairs(word): 72 | """Return set of symbol pairs in a word. 73 | 74 | Word is represented as tuple of symbols (symbols being variable-length strings). 75 | """ 76 | pairs = set() 77 | prev_char = word[0] 78 | for char in word[1:]: 79 | pairs.add((prev_char, char)) 80 | prev_char = char 81 | return pairs 82 | 83 | class GPT2Tokenizer(object): 84 | """ 85 | GPT-2 BPE tokenizer. Peculiarities: 86 | - Byte-level BPE 87 | """ 88 | @classmethod 89 | def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): 90 | """ 91 | Instantiate a GPT2Tokenizer from a pre-trained model file. 92 | Download and cache the pre-trained model file if needed. 93 | """ 94 | if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: 95 | vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] 96 | merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path] 97 | else: 98 | vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) 99 | merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME) 100 | # redirect to the cache, if necessary 101 | try: 102 | resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) 103 | resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir) 104 | except EnvironmentError: 105 | logger.error( 106 | "Model name '{}' was not found in model name list ({}). " 107 | "We assumed '{}' was a path or url but couldn't find files {} and {} " 108 | "at this path or url.".format( 109 | pretrained_model_name_or_path, 110 | ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), 111 | pretrained_model_name_or_path, 112 | vocab_file, merges_file)) 113 | return None 114 | if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file: 115 | logger.info("loading vocabulary file {}".format(vocab_file)) 116 | logger.info("loading merges file {}".format(merges_file)) 117 | else: 118 | logger.info("loading vocabulary file {} from cache at {}".format( 119 | vocab_file, resolved_vocab_file)) 120 | logger.info("loading merges file {} from cache at {}".format( 121 | merges_file, resolved_merges_file)) 122 | if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: 123 | # if we're using a pretrained model, ensure the tokenizer won't index sequences longer 124 | # than the number of positional embeddings 125 | max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] 126 | kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) 127 | # Instantiate tokenizer. 128 | tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs) 129 | return tokenizer 130 | 131 | def __init__(self, vocab_file, merges_file, errors='replace', max_len=None): 132 | self.max_len = max_len if max_len is not None else int(1e12) 133 | self.encoder = json.load(open(vocab_file)) 134 | self.decoder = {v:k for k,v in self.encoder.items()} 135 | self.errors = errors # how to handle errors in decoding 136 | self.byte_encoder = bytes_to_unicode() 137 | self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} 138 | bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] 139 | bpe_merges = [tuple(merge.split()) for merge in bpe_data] 140 | self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) 141 | self.cache = {} 142 | 143 | # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions 144 | self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") 145 | 146 | def __len__(self): 147 | return len(self.encoder) 148 | 149 | def bpe(self, token): 150 | if token in self.cache: 151 | return self.cache[token] 152 | word = tuple(token) 153 | pairs = get_pairs(word) 154 | 155 | if not pairs: 156 | return token 157 | 158 | while True: 159 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 160 | if bigram not in self.bpe_ranks: 161 | break 162 | first, second = bigram 163 | new_word = [] 164 | i = 0 165 | while i < len(word): 166 | try: 167 | j = word.index(first, i) 168 | new_word.extend(word[i:j]) 169 | i = j 170 | except ValueError: 171 | new_word.extend(word[i:]) 172 | break 173 | 174 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 175 | new_word.append(first+second) 176 | i += 2 177 | else: 178 | new_word.append(word[i]) 179 | i += 1 180 | new_word = tuple(new_word) 181 | word = new_word 182 | if len(word) == 1: 183 | break 184 | else: 185 | pairs = get_pairs(word) 186 | word = ' '.join(word) 187 | self.cache[token] = word 188 | return word 189 | 190 | def encode(self, text): 191 | bpe_tokens = [] 192 | for token in re.findall(self.pat, text): 193 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 194 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 195 | if len(bpe_tokens) > self.max_len: 196 | raise
ValueError( 197 | "Token indices sequence length is longer than the specified maximum " 198 | " sequence length for this OpenAI GPT-2 model ({} > {}). Running this" 199 | " sequence through the model will result in indexing errors".format(len(bpe_tokens), self.max_len) 200 | ) 201 | return bpe_tokens 202 | 203 | def decode(self, tokens): 204 | text = ''.join([self.decoder[token] for token in tokens]) 205 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) 206 | return text 207 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/tests/modeling_transfo_xl_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import json 21 | import random 22 | 23 | import torch 24 | 25 | from pytorch_pretrained_bert import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel) 26 | 27 | 28 | class TransfoXLModelTest(unittest.TestCase): 29 | class TransfoXLModelTester(object): 30 | 31 | def __init__(self, 32 | parent, 33 | batch_size=13, 34 | seq_length=7, 35 | mem_len=30, 36 | clamp_len=15, 37 | is_training=True, 38 | use_labels=True, 39 | vocab_size=99, 40 | cutoffs=[10, 50, 80], 41 | d_model=32, 42 | d_embed=32, 43 | n_head=4, 44 | d_head=8, 45 | d_inner=128, 46 | div_val=2, 47 | n_layer=5, 48 | scope=None, 49 | seed=1): 50 | self.parent = parent 51 | self.batch_size = batch_size 52 | self.seq_length = seq_length 53 | self.mem_len = mem_len 54 | self.clamp_len = clamp_len 55 | self.is_training = is_training 56 | self.use_labels = use_labels 57 | self.vocab_size = vocab_size 58 | self.cutoffs = cutoffs 59 | self.d_model = d_model 60 | self.d_embed = d_embed 61 | self.n_head = n_head 62 | self.d_head = d_head 63 | self.d_inner = d_inner 64 | self.div_val = div_val 65 | self.n_layer = n_layer 66 | self.scope = scope 67 | self.seed = seed 68 | 69 | def prepare_config_and_inputs(self): 70 | input_ids_1 = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size) 71 | input_ids_2 = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size) 72 | 73 | lm_labels = None 74 | if self.use_labels: 75 | lm_labels = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size) 76 | 77 | config = TransfoXLConfig( 78 | vocab_size_or_config_json_file=self.vocab_size, 79 | mem_len=self.mem_len, 80 | clamp_len=self.clamp_len, 81 | cutoffs=self.cutoffs, 82 | d_model=self.d_model, 83 | d_embed=self.d_embed, 84 | n_head=self.n_head, 85 | d_head=self.d_head, 86 | d_inner=self.d_inner, 87 | div_val=self.div_val, 88 | n_layer=self.n_layer) 89 | 90 | return (config, input_ids_1, input_ids_2, lm_labels) 91 | 92 | def 
set_seed(self): 93 | random.seed(self.seed) 94 | torch.manual_seed(self.seed) 95 | 96 | def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels): 97 | model = TransfoXLModel(config) 98 | model.eval() 99 | 100 | hidden_states_1, mems_1 = model(input_ids_1) 101 | hidden_states_2, mems_2 = model(input_ids_2, mems_1) 102 | outputs = { 103 | "hidden_states_1": hidden_states_1, 104 | "mems_1": mems_1, 105 | "hidden_states_2": hidden_states_2, 106 | "mems_2": mems_2, 107 | } 108 | return outputs 109 | 110 | def check_transfo_xl_model_output(self, result): 111 | self.parent.assertListEqual( 112 | list(result["hidden_states_1"].size()), 113 | [self.batch_size, self.seq_length, self.d_model]) 114 | self.parent.assertListEqual( 115 | list(result["hidden_states_2"].size()), 116 | [self.batch_size, self.seq_length, self.d_model]) 117 | self.parent.assertListEqual( 118 | list(list(mem.size()) for mem in result["mems_1"]), 119 | [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer) 120 | self.parent.assertListEqual( 121 | list(list(mem.size()) for mem in result["mems_2"]), 122 | [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer) 123 | 124 | 125 | def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels): 126 | model = TransfoXLLMHeadModel(config) 127 | model.eval() 128 | 129 | loss_1, mems_1a = model(input_ids_1, target=lm_labels) 130 | lm_logits_1, mems_1b = model(input_ids_1) 131 | 132 | loss_2, mems_2a = model(input_ids_2, target=lm_labels, mems=mems_1a) 133 | lm_logits_2, mems_2b = model(input_ids_2, mems=mems_1b) 134 | 135 | outputs = { 136 | "loss_1": loss_1, 137 | "mems_1a": mems_1a, 138 | "lm_logits_1": lm_logits_1, 139 | "mems_1b": mems_1b, 140 | "loss_2": loss_2, 141 | "mems_2a": mems_2a, 142 | "lm_logits_2": lm_logits_2, 143 | "mems_2b": mems_2b, 144 | } 145 | return outputs 146 | 147 | def check_transfo_xl_lm_head_output(self, result): 148 | self.parent.assertListEqual( 149 | list(result["loss_1"].size()), 150 | [self.batch_size, self.seq_length]) 151 | self.parent.assertListEqual( 152 | list(result["lm_logits_1"].size()), 153 | [self.batch_size, self.seq_length, self.vocab_size]) 154 | self.parent.assertListEqual( 155 | list(list(mem.size()) for mem in result["mems_1a"]), 156 | [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer) 157 | self.parent.assertListEqual( 158 | list(list(mem.size()) for mem in result["mems_1b"]), 159 | [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer) 160 | self.parent.assertListEqual( 161 | list(mem[~torch.isnan(mem)].sum() for mem in result["mems_1a"]), 162 | list(mem[~torch.isnan(mem)].sum() for mem in result["mems_1b"])) 163 | 164 | self.parent.assertListEqual( 165 | list(result["loss_2"].size()), 166 | [self.batch_size, self.seq_length]) 167 | self.parent.assertListEqual( 168 | list(result["lm_logits_2"].size()), 169 | [self.batch_size, self.seq_length, self.vocab_size]) 170 | self.parent.assertListEqual( 171 | list(list(mem.size()) for mem in result["mems_2a"]), 172 | [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer) 173 | self.parent.assertListEqual( 174 | list(list(mem.size()) for mem in result["mems_2b"]), 175 | [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer) 176 | self.parent.assertListEqual( 177 | list(mem[~torch.isnan(mem)].sum() for mem in result["mems_2a"]), 178 | list(mem[~torch.isnan(mem)].sum() for mem in result["mems_2b"])) 179 | 180 | def test_default(self): 181 | self.run_tester(TransfoXLModelTest.TransfoXLModelTester(self)) 
182 | 183 | def test_config_to_json_string(self): 184 | config = TransfoXLConfig(vocab_size_or_config_json_file=96, d_embed=37) 185 | obj = json.loads(config.to_json_string()) 186 | self.assertEqual(obj["n_token"], 96) 187 | self.assertEqual(obj["d_embed"], 37) 188 | 189 | def run_tester(self, tester): 190 | config_and_inputs = tester.prepare_config_and_inputs() 191 | 192 | tester.set_seed() 193 | output_result = tester.create_transfo_xl_model(*config_and_inputs) 194 | tester.check_transfo_xl_model_output(output_result) 195 | 196 | tester.set_seed() 197 | output_result = tester.create_transfo_xl_lm_head(*config_and_inputs) 198 | tester.check_transfo_xl_lm_head_output(output_result) 199 | 200 | @classmethod 201 | def ids_tensor(cls, shape, vocab_size, rng=None, name=None): 202 | """Creates a random int32 tensor of the shape within the vocab size.""" 203 | if rng is None: 204 | rng = random.Random() 205 | 206 | total_dims = 1 207 | for dim in shape: 208 | total_dims *= dim 209 | 210 | values = [] 211 | for _ in range(total_dims): 212 | values.append(rng.randint(0, vocab_size - 1)) 213 | 214 | return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous() 215 | 216 | 217 | if __name__ == "__main__": 218 | unittest.main() 219 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/tests/modeling_openai_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
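Like the GPT-2 and Transformer-XL tests above, the OpenAI GPT test below builds random integer id tensors with an `ids_tensor` helper, runs each model head in eval mode, and asserts on output shapes only. A minimal standalone sketch of that shared pattern (the toy sizes are illustrative, not taken from the test files):

    import random
    import torch

    def ids_tensor(shape, vocab_size, rng=None):
        # Random torch.long tensor with values in [0, vocab_size),
        # mirroring the ids_tensor classmethod used by these tests.
        rng = rng or random.Random()
        total = 1
        for dim in shape:
            total *= dim
        values = [rng.randint(0, vocab_size - 1) for _ in range(total)]
        return torch.tensor(values, dtype=torch.long).view(shape)

    batch_size, seq_length, vocab_size = 2, 5, 99
    input_ids = ids_tensor([batch_size, seq_length], vocab_size)
    assert list(input_ids.size()) == [batch_size, seq_length]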
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import json 21 | import random 22 | 23 | import torch 24 | 25 | from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel, 26 | OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) 27 | 28 | 29 | class OpenAIGPTModelTest(unittest.TestCase): 30 | class OpenAIGPTModelTester(object): 31 | 32 | def __init__(self, 33 | parent, 34 | batch_size=13, 35 | seq_length=7, 36 | is_training=True, 37 | use_position_ids=True, 38 | use_token_type_ids=True, 39 | use_labels=True, 40 | vocab_size=99, 41 | n_special=1, 42 | n_positions=33, 43 | n_embd=32, 44 | n_layer=5, 45 | n_head=4, 46 | n_choices=3, 47 | afn="gelu", 48 | resid_pdrop=0.1, 49 | attn_pdrop=0.1, 50 | embd_pdrop=0.1, 51 | type_sequence_label_size=2, 52 | initializer_range=0.02, 53 | num_labels=3, 54 | scope=None): 55 | self.parent = parent 56 | self.batch_size = batch_size 57 | self.seq_length = seq_length 58 | self.is_training = is_training 59 | self.use_position_ids = use_position_ids 60 | self.use_token_type_ids = use_token_type_ids 61 | self.use_labels = use_labels 62 | self.vocab_size = vocab_size 63 | self.n_special = n_special 64 | self.n_positions = n_positions 65 | self.n_embd = n_embd 66 | self.n_layer = n_layer 67 | self.n_head = n_head 68 | self.afn = afn 69 | self.n_choices = n_choices 70 | self.resid_pdrop = resid_pdrop 71 | self.attn_pdrop = attn_pdrop 72 | self.embd_pdrop = embd_pdrop 73 | self.type_sequence_label_size = type_sequence_label_size 74 | self.initializer_range = initializer_range 75 | self.num_labels = num_labels 76 | self.scope = scope 77 | 78 | def prepare_config_and_inputs(self): 79 | input_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.vocab_size) 80 | 81 | position_ids = None 82 | if self.use_position_ids: 83 | position_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions) 84 | 85 | token_type_ids = None 86 | if self.use_token_type_ids: 87 | total_voc = self.vocab_size + self.n_special 88 | token_type_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc) 89 | 90 | mc_labels = None 91 | lm_labels = None 92 | mc_token_ids = None 93 | if self.use_labels: 94 | mc_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size) 95 | lm_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels) 96 | mc_token_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices], self.seq_length) 97 | 98 | config = OpenAIGPTConfig( 99 | vocab_size_or_config_json_file=self.vocab_size, 100 | n_positions=self.n_positions, 101 | n_special=self.n_special, 102 | n_embd=self.n_embd, 103 | n_layer=self.n_layer, 104 | n_head=self.n_head, 105 | afn=self.afn, 106 | resid_pdrop=self.resid_pdrop, 107 | attn_pdrop=self.attn_pdrop, 108 | embd_pdrop=self.embd_pdrop, 109 | initializer_range=self.initializer_range) 110 | 111 | return (config, input_ids, token_type_ids, position_ids, 112 | mc_labels, lm_labels, mc_token_ids) 113 | 114 | def create_openai_model(self, config, input_ids, token_type_ids, position_ids, 115 | mc_labels, lm_labels, mc_token_ids): 116 | model = OpenAIGPTModel(config) 117 | model.eval() 118 | hidden_states = model(input_ids, position_ids, token_type_ids) 119 | outputs = { 120 | "hidden_states": hidden_states, 121 | } 122 | return outputs 123 
| 124 | def check_openai_model_output(self, result): 125 | self.parent.assertListEqual( 126 | list(result["hidden_states"].size()), 127 | [self.batch_size, self.n_choices, self.seq_length, self.n_embd]) 128 | 129 | 130 | def create_openai_lm_head(self, config, input_ids, token_type_ids, position_ids, 131 | mc_labels, lm_labels, mc_token_ids): 132 | model = OpenAIGPTLMHeadModel(config) 133 | model.eval() 134 | loss = model(input_ids, position_ids, token_type_ids, lm_labels) 135 | lm_logits = model(input_ids, position_ids, token_type_ids) 136 | outputs = { 137 | "loss": loss, 138 | "lm_logits": lm_logits, 139 | } 140 | return outputs 141 | 142 | def check_openai_lm_head_output(self, result): 143 | total_voc = self.n_special + self.vocab_size 144 | self.parent.assertListEqual( 145 | list(result["lm_logits"].size()), 146 | [self.batch_size, self.n_choices, self.seq_length, total_voc]) 147 | 148 | def check_openai_lm_head_loss_output(self, result): 149 | self.parent.assertListEqual( 150 | list(result["loss"].size()), 151 | []) 152 | 153 | def create_openai_double_heads(self, config, input_ids, token_type_ids, position_ids, 154 | mc_labels, lm_labels, mc_token_ids): 155 | model = OpenAIGPTDoubleHeadsModel(config) 156 | model.eval() 157 | loss = model(input_ids, mc_token_ids, 158 | lm_labels=lm_labels, mc_labels=mc_labels, 159 | token_type_ids=token_type_ids, position_ids=position_ids) 160 | lm_logits, mc_logits = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids) 161 | outputs = { 162 | "loss": loss, 163 | "lm_logits": lm_logits, 164 | "mc_logits": mc_logits, 165 | } 166 | return outputs 167 | 168 | def check_openai_double_heads_output(self, result): 169 | total_voc = self.n_special + self.vocab_size 170 | self.parent.assertListEqual( 171 | list(result["lm_logits"].size()), 172 | [self.batch_size, self.n_choices, self.seq_length, total_voc]) 173 | self.parent.assertListEqual( 174 | list(result["mc_logits"].size()), 175 | [self.batch_size, self.n_choices]) 176 | 177 | def check_openai_double_heads_loss_output(self, result): 178 | self.parent.assertListEqual( 179 | [list(l.size()) for l in result["loss"]], 180 | [[], []]) 181 | 182 | def test_default(self): 183 | self.run_tester(OpenAIGPTModelTest.OpenAIGPTModelTester(self)) 184 | 185 | def test_config_to_json_string(self): 186 | config = OpenAIGPTConfig(vocab_size_or_config_json_file=99, n_embd=37) 187 | obj = json.loads(config.to_json_string()) 188 | self.assertEqual(obj["vocab_size"], 99) 189 | self.assertEqual(obj["n_embd"], 37) 190 | 191 | def run_tester(self, tester): 192 | config_and_inputs = tester.prepare_config_and_inputs() 193 | output_result = tester.create_openai_model(*config_and_inputs) 194 | tester.check_openai_model_output(output_result) 195 | 196 | output_result = tester.create_openai_lm_head(*config_and_inputs) 197 | tester.check_openai_lm_head_output(output_result) 198 | tester.check_openai_lm_head_loss_output(output_result) 199 | 200 | output_result = tester.create_openai_double_heads(*config_and_inputs) 201 | tester.check_openai_double_heads_output(output_result) 202 | tester.check_openai_double_heads_loss_output(output_result) 203 | 204 | @classmethod 205 | def ids_tensor(cls, shape, vocab_size, rng=None, name=None): 206 | """Creates a random int32 tensor of the shape within the vocab size.""" 207 | if rng is None: 208 | rng = random.Random() 209 | 210 | total_dims = 1 211 | for dim in shape: 212 | total_dims *= dim 213 | 214 | values = [] 215 | for _ in range(total_dims): 216 | 
values.append(rng.randint(0, vocab_size - 1)) 217 | 218 | return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous() 219 | 220 | 221 | if __name__ == "__main__": 222 | unittest.main() 223 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/examples/run_classifier_class.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | import argparse 3 | import csv 4 | import logging 5 | import time 6 | import os 7 | import random 8 | import sys 9 | import re 10 | import json 11 | import numpy as np 12 | import torch 13 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 14 | TensorDataset) 15 | from torch.utils.data.distributed import DistributedSampler 16 | from tqdm import tqdm, trange 17 | import torch.nn.functional as F 18 | 19 | from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE 20 | from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME 21 | from pytorch_pretrained_bert.tokenization import BertTokenizer 22 | from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear 23 | 24 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 25 | datefmt='%m/%d/%Y %H:%M:%S', 26 | level=logging.INFO) 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | class InputExample(object): 31 | """A single training/test example for simple sequence classification.""" 32 | 33 | def __init__(self, guid, text_a, text_b=None, label=None): 34 | """Constructs an InputExample. 35 | 36 | Args: 37 | guid: Unique id for the example. 38 | text_a: string. The untokenized text of the first sequence. For single 39 | sequence tasks, only this sequence must be specified. 40 | text_b: (Optional) string. The untokenized text of the second sequence. 41 | Must be specified only for sequence pair tasks. 42 | label: (Optional) string. The label of the example. This should be 43 | specified for train and dev examples, but not for test examples.
44 | """ 45 | self.guid = guid 46 | self.text_a = text_a 47 | self.text_b = text_b 48 | self.label = label 49 | 50 | 51 | class InputFeatures(object): 52 | """A single set of features of data.""" 53 | 54 | def __init__(self, input_ids, input_mask, segment_ids, label_id): 55 | self.input_ids = input_ids 56 | self.input_mask = input_mask 57 | self.segment_ids = segment_ids 58 | self.label_id = label_id 59 | 60 | 61 | class DataProcessor(object): 62 | """Base class for data converters for sequence classification data sets.""" 63 | 64 | def get_train_examples(self, data_dir): 65 | """Gets a collection of `InputExample`s for the train set.""" 66 | raise NotImplementedError() 67 | 68 | def get_dev_examples(self, data_dir): 69 | """Gets a collection of `InputExample`s for the dev set.""" 70 | raise NotImplementedError() 71 | 72 | def get_labels(self): 73 | """Gets the list of labels for this data set.""" 74 | raise NotImplementedError() 75 | 76 | def _read_txt(self, line, quotechar=None): 77 | """Parses a single 'text_1#text_2<tab>label' line into [[text_1, text_2, label]].""" 78 | lines = [] 79 | line = line.strip() 80 | label = line[-1] 81 | text_1 = line[:-1].strip().split('#')[0] 82 | text_2 = line[:-1].strip().split('#')[1] 83 | ll_line = [text_1, text_2, label] 84 | lines.append(ll_line) 85 | return lines 86 | 87 | 88 | class SimProcessor(DataProcessor): 89 | 90 | def get_dev_examples(self, line): 91 | 92 | return self._create_examples(self._read_txt(line), "dev") 93 | # guid, sen1, sen2, label 94 | 95 | 96 | def _create_examples(self, lines, set_type): 97 | """Creates examples for the training and dev sets.""" 98 | examples = [] 99 | for (i, line) in enumerate(lines): 100 | guid = "%s-%s" % (set_type, i) 101 | text_a = line[0] 102 | text_b = line[1] 103 | label = line[2] 104 | examples.append( 105 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 106 | return examples 107 | 108 | 109 | # Returns all label classes 110 | def get_labels(self): 111 | """See base class.""" 112 | return ["0", "1"] 113 | 114 | 115 | def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer): 116 | """Loads a data file into a list of `InputBatch`s.""" 117 | 118 | label_map = {label: i for i, label in enumerate(label_list)} 119 | features = [] 120 | for (ex_index, example) in enumerate(examples): 121 | tokens_a = tokenizer.tokenize(example.text_a) 122 | tokens_b = None 123 | if example.text_b: 124 | tokens_b = tokenizer.tokenize(example.text_b) 125 | _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) 126 | else: 127 | if len(tokens_a) > max_seq_length - 2: 128 | tokens_a = tokens_a[:(max_seq_length - 2)] 129 | tokens = ["[CLS]"] + tokens_a + ["[SEP]"] 130 | segment_ids = [0] * len(tokens) 131 | if tokens_b: 132 | tokens += tokens_b + ["[SEP]"] 133 | segment_ids += [1] * (len(tokens_b) + 1) 134 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 135 | input_mask = [1] * len(input_ids) 136 | padding = [0] * (max_seq_length - len(input_ids)) 137 | input_ids += padding 138 | input_mask += padding 139 | segment_ids += padding 140 | 141 | assert len(input_ids) == max_seq_length 142 | assert len(input_mask) == max_seq_length 143 | assert len(segment_ids) == max_seq_length 144 | 145 | label_id = label_map[example.label] 146 | if ex_index < 5: 147 | logger.info("*** Example ***") 148 | logger.info("guid: %s" % (example.guid)) 149 | logger.info("tokens: %s" % " ".join( 150 | [str(x) for x in tokens])) 151 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 152 | logger.info("input_mask: %s" % " ".join([str(x) for x in
input_mask])) 153 | logger.info( 154 | "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) 155 | logger.info("label: %s (id = %d)" % (example.label, label_id)) 156 | 157 | features.append( 158 | InputFeatures(input_ids=input_ids, 159 | input_mask=input_mask, 160 | segment_ids=segment_ids, 161 | label_id=label_id)) 162 | return features 163 | 164 | 165 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 166 | """Truncates a sequence pair in place to the maximum length.""" 167 | 168 | # This is a simple heuristic which will always truncate the longer sequence 169 | # one token at a time. This makes more sense than truncating an equal percent 170 | # of tokens from each, since if one sequence is very short then each token 171 | # that's truncated likely contains more information than a longer sequence. 172 | while True: 173 | total_length = len(tokens_a) + len(tokens_b) 174 | if total_length <= max_length: 175 | break 176 | if len(tokens_a) > len(tokens_b): 177 | tokens_a.pop() 178 | else: 179 | tokens_b.pop() 180 | 181 | 182 | def accuracy(out, labels): 183 | outputs = np.argmax(out, axis=1) 184 | return np.sum(outputs == labels) 185 | 186 | 187 | class Predict: 188 | def __init__(self): 189 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 190 | c_bert_model = "./tmp_chinese/mrpc_output/" 191 | raw_bert_model = "./models/chinese_L-12_H-768_A-12" 192 | num_labels = 2 193 | self.tokenizer = BertTokenizer.from_pretrained(raw_bert_model) 194 | self.model = BertForSequenceClassification.from_pretrained(c_bert_model, num_labels=num_labels) 195 | self.model.to(device) 196 | 197 | def predict(self, line): 198 | processors = { 199 | # "cola": ColaProcessor, 200 | # "mnli": MnliProcessor, 201 | "mrpc": SimProcessor 202 | } 203 | 204 | num_labels_task = { 205 | "mrpc": 2, 206 | } 207 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 208 | 209 | 210 | processor = processors['mrpc']() 211 | label_list = processor.get_labels() 212 | max_seq_length = 128 213 | eval_batch_size = 8 214 | 215 | # tokenizer = BertTokenizer.from_pretrained(raw_bert_model) 216 | # model = BertForSequenceClassification.from_pretrained(c_bert_model, num_labels=num_labels) 217 | # model.to(device) 218 | 219 | test_line = line + "\t1" 220 | 221 | 222 | eval_examples = processor.get_dev_examples(test_line) 223 | eval_features = convert_examples_to_features(eval_examples, label_list, max_seq_length, self.tokenizer) 224 | 225 | all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) 226 | all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) 227 | all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) 228 | all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) 229 | eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) 230 | # Run prediction for full data 231 | eval_sampler = SequentialSampler(eval_data) 232 | eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size) 233 | 234 | self.model.eval() 235 | 236 | for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): 237 | input_ids = input_ids.to(device) 238 | input_mask = input_mask.to(device) 239 | segment_ids = segment_ids.to(device) 240 | label_ids = label_ids.to(device) 241 | 242 | with torch.no_grad(): 243 | tmp_eval_loss = self.model(input_ids, segment_ids, input_mask, label_ids) 244 | logits = 
self.model(input_ids, segment_ids, input_mask) 245 | score = F.softmax(logits, 1) 246 | maximum_probability = score.detach().cpu().numpy()[0].max() 247 | print(maximum_probability) 248 | logits = logits.detach().cpu().numpy()[0] 249 | res = np.argmax(logits) 250 | # return res 251 | 252 | id2Senti = { 253 | "0":'不同', 254 | "1":'相同', 255 | } 256 | 257 | result = { 258 | 'content': line, 259 | 'result': id2Senti[str(res)], 260 | 'probability': str(round(100*maximum_probability,2))+'%' 261 | } 262 | # return result 263 | return json.dumps(result, ensure_ascii=False) 264 | # label_ids = label_ids.to('cpu').numpy() 265 | 266 | 267 | if __name__ == "__main__": 268 | p = Predict() 269 | print(p.predict("你多大了?#你的年龄是多少?")) 270 | #input_file = './chinese_data/data_dev.txt' 271 | #sequence = read_txt(input_file) 272 | #print(time.strftime("%H:%M:%S")) 273 | #for i in range(len(sequence)): 274 | #print(p.predict(sequence[i])) 275 | #print(time.strftime("%H:%M:%S")) 276 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/pytorch_pretrained_bert/tokenization_openai.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for OpenAI GPT.""" 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import json 20 | import logging 21 | import os 22 | import re 23 | import sys 24 | from io import open 25 | 26 | from tqdm import tqdm 27 | 28 | from .file_utils import cached_path 29 | from .tokenization import BasicTokenizer 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | PRETRAINED_VOCAB_ARCHIVE_MAP = { 34 | 'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json", 35 | } 36 | PRETRAINED_MERGES_ARCHIVE_MAP = { 37 | 'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt", 38 | } 39 | PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { 40 | 'openai-gpt': 512, 41 | } 42 | VOCAB_NAME = 'vocab.json' 43 | MERGES_NAME = 'merges.txt' 44 | 45 | def get_pairs(word): 46 | """ 47 | Return set of symbol pairs in a word. 
word is represented as tuple of symbols (symbols being variable-length strings) 49 | """ 50 | pairs = set() 51 | prev_char = word[0] 52 | for char in word[1:]: 53 | pairs.add((prev_char, char)) 54 | prev_char = char 55 | return pairs 56 | 57 | def text_standardize(text): 58 | """ 59 | Fixes some issues the spacy tokenizer had on books corpus 60 | and does some whitespace standardization. 61 | """ 62 | text = text.replace('—', '-') 63 | text = text.replace('–', '-') 64 | text = text.replace('―', '-') 65 | text = text.replace('…', '...') 66 | text = text.replace('´', "'") 67 | text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text) 68 | text = re.sub(r'\s*\n\s*', ' \n ', text) 69 | text = re.sub(r'[^\S\n]+', ' ', text) 70 | return text.strip() 71 | 72 | class OpenAIGPTTokenizer(object): 73 | """ 74 | BPE tokenizer. Peculiarities: 75 | - lower case all inputs 76 | - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, falling back to BERT's BasicTokenizer if not. 77 | - argument special_tokens and function set_special_tokens: 78 | can be used to add additional symbols (ex: "__classify__") to a vocabulary. 79 | """ 80 | @classmethod 81 | def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): 82 | """ 83 | Instantiate an OpenAIGPTTokenizer from a pre-trained model file. 84 | Download and cache the pre-trained model file if needed. 85 | """ 86 | if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: 87 | vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] 88 | merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path] 89 | else: 90 | vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) 91 | merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME) 92 | # redirect to the cache, if necessary 93 | try: 94 | resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) 95 | resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir) 96 | except EnvironmentError: 97 | logger.error( 98 | "Model name '{}' was not found in model name list ({}). " 99 | "We assumed '{}' was a path or url but couldn't find files {} and {} " 100 | "at this path or url.".format( 101 | pretrained_model_name_or_path, 102 | ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), 103 | pretrained_model_name_or_path, 104 | vocab_file, merges_file)) 105 | return None 106 | if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file: 107 | logger.info("loading vocabulary file {}".format(vocab_file)) 108 | logger.info("loading merges file {}".format(merges_file)) 109 | else: 110 | logger.info("loading vocabulary file {} from cache at {}".format( 111 | vocab_file, resolved_vocab_file)) 112 | logger.info("loading merges file {} from cache at {}".format( 113 | merges_file, resolved_merges_file)) 114 | if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: 115 | # if we're using a pretrained model, ensure the tokenizer won't index sequences longer 116 | # than the number of positional embeddings 117 | max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] 118 | kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) 119 | # Instantiate tokenizer.
120 | tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs) 121 | return tokenizer 122 | 123 | def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None): 124 | try: 125 | import ftfy 126 | import spacy 127 | self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat']) 128 | self.fix_text = ftfy.fix_text 129 | except ImportError: 130 | logger.warning("ftfy or spacy is not installed; using BERT BasicTokenizer instead of SpaCy & ftfy.") 131 | self.nlp = BasicTokenizer(do_lower_case=True, 132 | never_split=special_tokens if special_tokens is not None else []) 133 | self.fix_text = None 134 | 135 | self.max_len = max_len if max_len is not None else int(1e12) 136 | self.encoder = json.load(open(vocab_file, encoding="utf-8")) 137 | self.decoder = {v:k for k,v in self.encoder.items()} 138 | merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] 139 | merges = [tuple(merge.split()) for merge in merges] 140 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 141 | self.cache = {} 142 | self.set_special_tokens(special_tokens) 143 | 144 | def __len__(self): 145 | return len(self.encoder) + len(self.special_tokens) 146 | 147 | def set_special_tokens(self, special_tokens): 148 | """ Add a list of additional tokens to the encoder. 149 | The additional tokens are indexed starting from the last index of the 150 | current vocabulary in the order of the `special_tokens` list. 151 | """ 152 | if not special_tokens: 153 | self.special_tokens = {} 154 | self.special_tokens_decoder = {} 155 | return 156 | self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens)) 157 | self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()} 158 | if self.fix_text is None: 159 | # Using BERT's BasicTokenizer: we can update the tokenizer 160 | self.nlp.never_split = special_tokens 161 | logger.info("Special tokens {}".format(self.special_tokens)) 162 | 163 | def bpe(self, token): 164 | word = tuple(token[:-1]) + (token[-1] + '</w>',)  # '</w>' marks end-of-word in the OpenAI GPT BPE vocab 165 | if token in self.cache: 166 | return self.cache[token] 167 | pairs = get_pairs(word) 168 | 169 | if not pairs: 170 | return token+'</w>' 171 | 172 | while True: 173 | bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) 174 | if bigram not in self.bpe_ranks: 175 | break 176 | first, second = bigram 177 | new_word = [] 178 | i = 0 179 | while i < len(word): 180 | try: 181 | j = word.index(first, i) 182 | new_word.extend(word[i:j]) 183 | i = j 184 | except ValueError: 185 | new_word.extend(word[i:]) 186 | break 187 | 188 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 189 | new_word.append(first+second) 190 | i += 2 191 | else: 192 | new_word.append(word[i]) 193 | i += 1 194 | new_word = tuple(new_word) 195 | word = new_word 196 | if len(word) == 1: 197 | break 198 | else: 199 | pairs = get_pairs(word) 200 | word = ' '.join(word) 201 | if word == '\n  </w>': 202 | word = '\n</w>' 203 | self.cache[token] = word 204 | return word 205 | 206 | def tokenize(self, text): 207 | """ Tokenize a string. """ 208 | split_tokens = [] 209 | if self.fix_text is None: 210 | # Using BERT's BasicTokenizer 211 | text = self.nlp.tokenize(text) 212 | for token in text: 213 | split_tokens.extend([t for t in self.bpe(token).split(' ')]) 214 | else: 215 | # Using SpaCy & ftfy (original tokenization process of OpenAI GPT) 216 | text = self.nlp(text_standardize(self.fix_text(text))) 217 | for token in text: 218 | split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')]) 219 | return split_tokens 220 | 221 | def convert_tokens_to_ids(self, tokens): 222 | """ Converts a sequence of tokens into ids using the vocab. """ 223 | ids = [] 224 | if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)): 225 | if tokens in self.special_tokens: 226 | return self.special_tokens[tokens] 227 | else: 228 | return self.encoder.get(tokens, 0) 229 | for token in tokens: 230 | if token in self.special_tokens: 231 | ids.append(self.special_tokens[token]) 232 | else: 233 | ids.append(self.encoder.get(token, 0)) 234 | if len(ids) > self.max_len: 235 | raise ValueError( 236 | "Token indices sequence length is longer than the specified maximum " 237 | " sequence length for this OpenAI GPT model ({} > {}). Running this" 238 | " sequence through the model will result in indexing errors".format(len(ids), self.max_len) 239 | ) 240 | return ids 241 | 242 | def convert_ids_to_tokens(self, ids, skip_special_tokens=False): 243 | """Converts a sequence of ids in BPE tokens using the vocab.""" 244 | tokens = [] 245 | for i in ids: 246 | if i in self.special_tokens_decoder: 247 | if not skip_special_tokens: 248 | tokens.append(self.special_tokens_decoder[i]) 249 | else: 250 | tokens.append(self.decoder[i]) 251 | return tokens 252 | 253 | def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=False): 254 | """Converts a sequence of ids in a string.""" 255 | tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens) 256 | out_string = ''.join(tokens).replace('</w>', ' ').strip() 257 | if clean_up_tokenization_spaces: 258 | out_string = out_string.replace('<unk>', '') 259 | out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',').replace(" ' ", "'" 260 | ).replace(" n't", "n't").replace(" 'm", "'m").replace(" 're", "'re").replace(" do not", " don't" 261 | ).replace(" 's", "'s").replace(" t ", "'t ").replace(" s ", "'s ").replace(" m ", "'m " 262 | ).replace(" 've", "'ve") 263 | return out_string 264 | -------------------------------------------------------------------------------- /sentence_similarity_Bert/examples/extract_features.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | """Extract pre-computed feature vectors from a PyTorch BERT model."""
16 | 
17 | # Python 3.x compatibility features
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 | 
22 | import argparse
23 | import collections  # supplementary container datatypes beyond dict, tuple, etc.
24 | import logging  # standard logging module
25 | import json
26 | import re
27 | 
28 | import torch
29 | from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
30 | from torch.utils.data.distributed import DistributedSampler
31 | 
32 | from pytorch_pretrained_bert.tokenization import BertTokenizer
33 | from pytorch_pretrained_bert.modeling import BertModel
34 | 
35 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
36 |                     datefmt='%m/%d/%Y %H:%M:%S',
37 |                     level=logging.INFO)
38 | logger = logging.getLogger(__name__)
39 | 
40 | 
41 | class InputExample(object):
42 | 
43 |     def __init__(self, unique_id, text_a, text_b):
44 |         self.unique_id = unique_id
45 |         self.text_a = text_a
46 |         self.text_b = text_b
47 | 
48 | 
49 | class InputFeatures(object):
50 |     """A single set of features of data."""
51 | 
52 |     def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
53 |         self.unique_id = unique_id
54 |         self.tokens = tokens
55 |         self.input_ids = input_ids
56 |         self.input_mask = input_mask
57 |         self.input_type_ids = input_type_ids
58 | 
59 | 
60 | def convert_examples_to_features(examples, seq_length, tokenizer):
61 |     """Loads a data file into a list of `InputBatch`s."""
62 | 
63 |     features = []
64 |     for (ex_index, example) in enumerate(examples):
65 |         tokens_a = tokenizer.tokenize(example.text_a)
66 | 
67 |         tokens_b = None
68 |         if example.text_b:
69 |             tokens_b = tokenizer.tokenize(example.text_b)
70 | 
71 |         if tokens_b:
72 |             # Modifies `tokens_a` and `tokens_b` in place so that the total
73 |             # length is less than the specified length.
74 |             # Account for [CLS], [SEP], [SEP] with "- 3"
75 |             _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
76 |         else:
77 |             # Account for [CLS] and [SEP] with "- 2"
78 |             if len(tokens_a) > seq_length - 2:
79 |                 tokens_a = tokens_a[0:(seq_length - 2)]
80 | 
81 |         # The convention in BERT is:
82 |         # (a) For sequence pairs:
83 |         #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
84 |         #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
85 |         # (b) For single sequences:
86 |         #  tokens:   [CLS] the dog is hairy . [SEP]
87 |         #  type_ids: 0     0   0   0  0     0 0
88 |         #
89 |         # Where "type_ids" are used to indicate whether this is the first
90 |         # sequence or the second sequence. The embedding vectors for `type=0` and
91 |         # `type=1` were learned during pre-training and are added to the wordpiece
92 |         # embedding vector (and position vector). This is not *strictly* necessary
93 |         # since the [SEP] token unambiguously separates the sequences, but it makes
94 |         # it easier for the model to learn the concept of sequences.
95 |         #
96 |         # For classification tasks, the first vector (corresponding to [CLS]) is
97 |         # used as the "sentence vector". Note that this only makes sense because
98 |         # the entire model is fine-tuned.
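        # Illustrative trace (an added note, not from the original code): for
        # the toy pair text_a="花呗", text_b="借呗" with seq_length=10, the code
        # below builds:
        #   tokens:          [CLS] 花 呗 [SEP] 借 呗 [SEP]
        #   input_type_ids:    0   0  0    0   1  1    1   0  0  0
        #   input_mask:        1   1  1    1   1  1    1   0  0  0
        # Only input_ids, input_mask and input_type_ids are zero-padded up to
        # seq_length; the `tokens` list keeps its unpadded length.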
99 |         tokens = []
100 |         input_type_ids = []
101 |         tokens.append("[CLS]")
102 |         input_type_ids.append(0)
103 |         for token in tokens_a:
104 |             tokens.append(token)
105 |             input_type_ids.append(0)
106 |         tokens.append("[SEP]")
107 |         input_type_ids.append(0)
108 | 
109 |         if tokens_b:
110 |             for token in tokens_b:
111 |                 tokens.append(token)
112 |                 input_type_ids.append(1)
113 |             tokens.append("[SEP]")
114 |             input_type_ids.append(1)
115 | 
116 |         input_ids = tokenizer.convert_tokens_to_ids(tokens)
117 | 
118 |         # The mask has 1 for real tokens and 0 for padding tokens. Only real
119 |         # tokens are attended to.
120 |         input_mask = [1] * len(input_ids)
121 | 
122 |         # Zero-pad up to the sequence length.
123 |         while len(input_ids) < seq_length:
124 |             input_ids.append(0)
125 |             input_mask.append(0)
126 |             input_type_ids.append(0)
127 | 
128 |         assert len(input_ids) == seq_length
129 |         assert len(input_mask) == seq_length
130 |         assert len(input_type_ids) == seq_length
131 | 
132 |         if ex_index < 5:
133 |             logger.info("*** Example ***")
134 |             logger.info("unique_id: %s" % (example.unique_id))
135 |             logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
136 |             logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
137 |             logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
138 |             logger.info(
139 |                 "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
140 | 
141 |         features.append(
142 |             InputFeatures(
143 |                 unique_id=example.unique_id,
144 |                 tokens=tokens,
145 |                 input_ids=input_ids,
146 |                 input_mask=input_mask,
147 |                 input_type_ids=input_type_ids))
148 |     return features
149 | 
150 | 
151 | def _truncate_seq_pair(tokens_a, tokens_b, max_length):
152 |     """Truncates a sequence pair in place to the maximum length."""
153 | 
154 |     # This is a simple heuristic which will always truncate the longer sequence
155 |     # one token at a time. This makes more sense than truncating an equal percent
156 |     # of tokens from each, since if one sequence is very short then each token
157 |     # that's truncated likely contains more information than a longer sequence.
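    # Worked example (added for illustration): with max_length=6 and input
    # lengths len(tokens_a)=5, len(tokens_b)=3, the loop below pops from the
    # longer list twice (5+3 -> 4+3 -> 3+3) and stops at 3+3, rather than
    # trimming both sequences by an equal fraction.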
158 |     while True:
159 |         total_length = len(tokens_a) + len(tokens_b)
160 |         if total_length <= max_length:
161 |             break
162 |         if len(tokens_a) > len(tokens_b):
163 |             tokens_a.pop()
164 |         else:
165 |             tokens_b.pop()
166 | 
167 | 
168 | def read_examples(input_file):
169 |     """Read a list of `InputExample`s from an input file."""
170 |     examples = []
171 |     unique_id = 0
172 |     with open(input_file, "r", encoding='utf-8') as reader:
173 |         while True:
174 |             line = reader.readline()
175 |             if not line:
176 |                 break
177 |             line = line.strip()
178 |             text_a = None
179 |             text_b = None
180 |             m = re.match(r"^(.*) \|\|\| (.*)$", line)
181 |             if m is None:
182 |                 text_a = line
183 |             else:
184 |                 text_a = m.group(1)
185 |                 text_b = m.group(2)
186 |             examples.append(
187 |                 InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
188 |             unique_id += 1
189 |     return examples
190 | 
191 | 
192 | def main():
193 |     parser = argparse.ArgumentParser()
194 | 
195 |     ## Required parameters
196 |     parser.add_argument("--input_file", default=None, type=str, required=True)
197 |     parser.add_argument("--output_file", default=None, type=str, required=True)
198 |     parser.add_argument("--bert_model", default=None, type=str, required=True,
199 |                         help="Bert pre-trained model selected in the list: bert-base-uncased, "
200 |                              "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
201 | 
202 |     ## Other parameters
203 |     parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
204 |     parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
205 |     parser.add_argument("--max_seq_length", default=128, type=int,
206 |                         help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
207 |                              "than this will be truncated, and sequences shorter than this will be padded.")
208 |     parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
209 |     parser.add_argument("--local_rank",
210 |                         type=int,
211 |                         default=-1,
212 |                         help="local_rank for distributed training on gpus")
213 |     parser.add_argument("--no_cuda",
214 |                         action='store_true',
215 |                         help="Whether not to use CUDA when available")
216 | 
217 |     args = parser.parse_args()
218 | 
219 |     if args.local_rank == -1 or args.no_cuda:
220 |         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
221 |         n_gpu = torch.cuda.device_count()
222 |     else:
223 |         device = torch.device("cuda", args.local_rank)
224 |         n_gpu = 1
225 |         # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
226 |         torch.distributed.init_process_group(backend='nccl')
227 |     logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(args.local_rank != -1)))
228 | 
229 |     layer_indexes = [int(x) for x in args.layers.split(",")]
230 | 
231 |     tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
232 | 
233 |     examples = read_examples(args.input_file)
234 | 
235 |     features = convert_examples_to_features(
236 |         examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer)
237 | 
238 |     unique_id_to_feature = {}
239 |     for feature in features:
240 |         unique_id_to_feature[feature.unique_id] = feature
241 | 
242 |     model = BertModel.from_pretrained(args.bert_model)
243 |     model.to(device)
244 | 
245 |     if args.local_rank != -1:
246 |         model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
247 |                                                           output_device=args.local_rank)
248 |     elif n_gpu > 1:
249 |         model = torch.nn.DataParallel(model)
250 | 
251 |     all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
252 |     all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
253 |     all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
254 | 
255 |     eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
256 |     if args.local_rank == -1:
257 |         eval_sampler = SequentialSampler(eval_data)
258 |     else:
259 |         eval_sampler = DistributedSampler(eval_data)
260 |     eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
261 | 
262 |     model.eval()
263 |     with open(args.output_file, "w", encoding='utf-8') as writer:
264 |         for input_ids, input_mask, example_indices in eval_dataloader:
265 |             input_ids = input_ids.to(device)
266 |             input_mask = input_mask.to(device)
267 | 
268 |             all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)
269 |             # all_encoder_layers is a list with one hidden-state tensor per BERT layer
270 | 
271 |             for b, example_index in enumerate(example_indices):
272 |                 feature = features[example_index.item()]
273 |                 unique_id = int(feature.unique_id)
274 |                 # feature = unique_id_to_feature[unique_id]
275 |                 output_json = collections.OrderedDict()
276 |                 output_json["linex_index"] = unique_id
277 |                 all_out_features = []
278 |                 for (i, token) in enumerate(feature.tokens):
279 |                     all_layers = []
280 |                     for (j, layer_index) in enumerate(layer_indexes):
281 |                         layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
282 |                         layer_output = layer_output[b]
283 |                         layers = collections.OrderedDict()
284 |                         layers["index"] = layer_index
285 |                         layers["values"] = [
286 |                             round(x.item(), 6) for x in layer_output[i]
287 |                         ]
288 |                         all_layers.append(layers)
289 |                     out_features = collections.OrderedDict()
290 |                     out_features["token"] = token
291 |                     out_features["layers"] = all_layers
292 |                     all_out_features.append(out_features)
293 |                 output_json["features"] = all_out_features
294 |                 writer.write(json.dumps(output_json) + "\n")
295 | 
296 | 
297 | if __name__ == "__main__":
298 |     main()
299 | 
--------------------------------------------------------------------------------
/sentence_similarity_Bert/tests/modeling_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
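# A note added for orientation: the tests below build tiny randomly initialized
# BERT models (hidden_size=32, 5 layers, 4 attention heads) and assert only on
# output shapes, so they run in seconds on CPU. A typical invocation (the
# module path is a guess; adjust it to your checkout layout) would be:
#
#   python -m unittest tests.modeling_test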
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import json 21 | import random 22 | 23 | import torch 24 | 25 | from pytorch_pretrained_bert import (BertConfig, BertModel, BertForMaskedLM, 26 | BertForNextSentencePrediction, BertForPreTraining, 27 | BertForQuestionAnswering, BertForSequenceClassification, 28 | BertForTokenClassification) 29 | 30 | 31 | class BertModelTest(unittest.TestCase): 32 | class BertModelTester(object): 33 | 34 | def __init__(self, 35 | parent, 36 | batch_size=13, 37 | seq_length=7, 38 | is_training=True, 39 | use_input_mask=True, 40 | use_token_type_ids=True, 41 | use_labels=True, 42 | vocab_size=99, 43 | hidden_size=32, 44 | num_hidden_layers=5, 45 | num_attention_heads=4, 46 | intermediate_size=37, 47 | hidden_act="gelu", 48 | hidden_dropout_prob=0.1, 49 | attention_probs_dropout_prob=0.1, 50 | max_position_embeddings=512, 51 | type_vocab_size=16, 52 | type_sequence_label_size=2, 53 | initializer_range=0.02, 54 | num_labels=3, 55 | scope=None): 56 | self.parent = parent 57 | self.batch_size = batch_size 58 | self.seq_length = seq_length 59 | self.is_training = is_training 60 | self.use_input_mask = use_input_mask 61 | self.use_token_type_ids = use_token_type_ids 62 | self.use_labels = use_labels 63 | self.vocab_size = vocab_size 64 | self.hidden_size = hidden_size 65 | self.num_hidden_layers = num_hidden_layers 66 | self.num_attention_heads = num_attention_heads 67 | self.intermediate_size = intermediate_size 68 | self.hidden_act = hidden_act 69 | self.hidden_dropout_prob = hidden_dropout_prob 70 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 71 | self.max_position_embeddings = max_position_embeddings 72 | self.type_vocab_size = type_vocab_size 73 | self.type_sequence_label_size = type_sequence_label_size 74 | self.initializer_range = initializer_range 75 | self.num_labels = num_labels 76 | self.scope = scope 77 | 78 | def prepare_config_and_inputs(self): 79 | input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size) 80 | 81 | input_mask = None 82 | if self.use_input_mask: 83 | input_mask = BertModelTest.ids_tensor([self.batch_size, self.seq_length], vocab_size=2) 84 | 85 | token_type_ids = None 86 | if self.use_token_type_ids: 87 | token_type_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) 88 | 89 | sequence_labels = None 90 | token_labels = None 91 | if self.use_labels: 92 | sequence_labels = BertModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size) 93 | token_labels = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.num_labels) 94 | 95 | config = BertConfig( 96 | vocab_size_or_config_json_file=self.vocab_size, 97 | hidden_size=self.hidden_size, 98 | num_hidden_layers=self.num_hidden_layers, 99 | num_attention_heads=self.num_attention_heads, 100 | intermediate_size=self.intermediate_size, 101 | hidden_act=self.hidden_act, 102 | hidden_dropout_prob=self.hidden_dropout_prob, 103 | attention_probs_dropout_prob=self.attention_probs_dropout_prob, 104 | max_position_embeddings=self.max_position_embeddings, 105 | type_vocab_size=self.type_vocab_size, 106 | initializer_range=self.initializer_range) 107 | 108 | return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels 109 | 110 | def check_loss_output(self, result): 111 | self.parent.assertListEqual( 112 | list(result["loss"].size()), 113 | []) 114 | 115 | def 
create_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): 116 | model = BertModel(config=config) 117 | model.eval() 118 | all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) 119 | outputs = { 120 | "sequence_output": all_encoder_layers[-1], 121 | "pooled_output": pooled_output, 122 | "all_encoder_layers": all_encoder_layers, 123 | } 124 | return outputs 125 | 126 | def check_bert_model_output(self, result): 127 | self.parent.assertListEqual( 128 | [size for layer in result["all_encoder_layers"] for size in layer.size()], 129 | [self.batch_size, self.seq_length, self.hidden_size] * self.num_hidden_layers) 130 | self.parent.assertListEqual( 131 | list(result["sequence_output"].size()), 132 | [self.batch_size, self.seq_length, self.hidden_size]) 133 | self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size]) 134 | 135 | 136 | def create_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): 137 | model = BertForMaskedLM(config=config) 138 | model.eval() 139 | loss = model(input_ids, token_type_ids, input_mask, token_labels) 140 | prediction_scores = model(input_ids, token_type_ids, input_mask) 141 | outputs = { 142 | "loss": loss, 143 | "prediction_scores": prediction_scores, 144 | } 145 | return outputs 146 | 147 | def check_bert_for_masked_lm_output(self, result): 148 | self.parent.assertListEqual( 149 | list(result["prediction_scores"].size()), 150 | [self.batch_size, self.seq_length, self.vocab_size]) 151 | 152 | def create_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): 153 | model = BertForNextSentencePrediction(config=config) 154 | model.eval() 155 | loss = model(input_ids, token_type_ids, input_mask, sequence_labels) 156 | seq_relationship_score = model(input_ids, token_type_ids, input_mask) 157 | outputs = { 158 | "loss": loss, 159 | "seq_relationship_score": seq_relationship_score, 160 | } 161 | return outputs 162 | 163 | def check_bert_for_next_sequence_prediction_output(self, result): 164 | self.parent.assertListEqual( 165 | list(result["seq_relationship_score"].size()), 166 | [self.batch_size, 2]) 167 | 168 | 169 | def create_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): 170 | model = BertForPreTraining(config=config) 171 | model.eval() 172 | loss = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels) 173 | prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask) 174 | outputs = { 175 | "loss": loss, 176 | "prediction_scores": prediction_scores, 177 | "seq_relationship_score": seq_relationship_score, 178 | } 179 | return outputs 180 | 181 | def check_bert_for_pretraining_output(self, result): 182 | self.parent.assertListEqual( 183 | list(result["prediction_scores"].size()), 184 | [self.batch_size, self.seq_length, self.vocab_size]) 185 | self.parent.assertListEqual( 186 | list(result["seq_relationship_score"].size()), 187 | [self.batch_size, 2]) 188 | 189 | 190 | def create_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): 191 | model = BertForQuestionAnswering(config=config) 192 | model.eval() 193 | loss = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels) 194 | start_logits, end_logits = model(input_ids, token_type_ids, input_mask) 195 | 
outputs = { 196 | "loss": loss, 197 | "start_logits": start_logits, 198 | "end_logits": end_logits, 199 | } 200 | return outputs 201 | 202 | def check_bert_for_question_answering_output(self, result): 203 | self.parent.assertListEqual( 204 | list(result["start_logits"].size()), 205 | [self.batch_size, self.seq_length]) 206 | self.parent.assertListEqual( 207 | list(result["end_logits"].size()), 208 | [self.batch_size, self.seq_length]) 209 | 210 | 211 | def create_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): 212 | model = BertForSequenceClassification(config=config, num_labels=self.num_labels) 213 | model.eval() 214 | loss = model(input_ids, token_type_ids, input_mask, sequence_labels) 215 | logits = model(input_ids, token_type_ids, input_mask) 216 | outputs = { 217 | "loss": loss, 218 | "logits": logits, 219 | } 220 | return outputs 221 | 222 | def check_bert_for_sequence_classification_output(self, result): 223 | self.parent.assertListEqual( 224 | list(result["logits"].size()), 225 | [self.batch_size, self.num_labels]) 226 | 227 | 228 | def create_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): 229 | model = BertForTokenClassification(config=config, num_labels=self.num_labels) 230 | model.eval() 231 | loss = model(input_ids, token_type_ids, input_mask, token_labels) 232 | logits = model(input_ids, token_type_ids, input_mask) 233 | outputs = { 234 | "loss": loss, 235 | "logits": logits, 236 | } 237 | return outputs 238 | 239 | def check_bert_for_token_classification_output(self, result): 240 | self.parent.assertListEqual( 241 | list(result["logits"].size()), 242 | [self.batch_size, self.seq_length, self.num_labels]) 243 | 244 | 245 | def test_default(self): 246 | self.run_tester(BertModelTest.BertModelTester(self)) 247 | 248 | def test_config_to_json_string(self): 249 | config = BertConfig(vocab_size_or_config_json_file=99, hidden_size=37) 250 | obj = json.loads(config.to_json_string()) 251 | self.assertEqual(obj["vocab_size"], 99) 252 | self.assertEqual(obj["hidden_size"], 37) 253 | 254 | def run_tester(self, tester): 255 | config_and_inputs = tester.prepare_config_and_inputs() 256 | output_result = tester.create_bert_model(*config_and_inputs) 257 | tester.check_bert_model_output(output_result) 258 | 259 | output_result = tester.create_bert_for_masked_lm(*config_and_inputs) 260 | tester.check_bert_for_masked_lm_output(output_result) 261 | tester.check_loss_output(output_result) 262 | 263 | output_result = tester.create_bert_for_next_sequence_prediction(*config_and_inputs) 264 | tester.check_bert_for_next_sequence_prediction_output(output_result) 265 | tester.check_loss_output(output_result) 266 | 267 | output_result = tester.create_bert_for_pretraining(*config_and_inputs) 268 | tester.check_bert_for_pretraining_output(output_result) 269 | tester.check_loss_output(output_result) 270 | 271 | output_result = tester.create_bert_for_question_answering(*config_and_inputs) 272 | tester.check_bert_for_question_answering_output(output_result) 273 | tester.check_loss_output(output_result) 274 | 275 | output_result = tester.create_bert_for_sequence_classification(*config_and_inputs) 276 | tester.check_bert_for_sequence_classification_output(output_result) 277 | tester.check_loss_output(output_result) 278 | 279 | output_result = tester.create_bert_for_token_classification(*config_and_inputs) 280 | tester.check_bert_for_token_classification_output(output_result) 
281 |         tester.check_loss_output(output_result)
282 | 
283 |     @classmethod
284 |     def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
285 |         """Creates a random integer tensor of the given shape with values in [0, vocab_size)."""
286 |         if rng is None:
287 |             rng = random.Random()
288 | 
289 |         total_dims = 1
290 |         for dim in shape:
291 |             total_dims *= dim
292 | 
293 |         values = []
294 |         for _ in range(total_dims):
295 |             values.append(rng.randint(0, vocab_size - 1))
296 | 
297 |         return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
298 | 
299 | 
300 | if __name__ == "__main__":
301 |     unittest.main()
302 | 
--------------------------------------------------------------------------------
/sentence_similarity_Bert/pytorch_pretrained_bert/tokenization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes."""
16 | 
17 | from __future__ import absolute_import, division, print_function, unicode_literals
18 | 
19 | import collections
20 | import logging
21 | import os
22 | import unicodedata
23 | from io import open
24 | 
25 | from .file_utils import cached_path
26 | 
27 | logger = logging.getLogger(__name__)
28 | 
29 | PRETRAINED_VOCAB_ARCHIVE_MAP = {
30 |     'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
31 |     'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
32 |     'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
33 |     'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
34 |     'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
35 |     'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
36 |     'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
37 | }
38 | PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
39 |     'bert-base-uncased': 512,
40 |     'bert-large-uncased': 512,
41 |     'bert-base-cased': 512,
42 |     'bert-large-cased': 512,
43 |     'bert-base-multilingual-uncased': 512,
44 |     'bert-base-multilingual-cased': 512,
45 |     'bert-base-chinese': 512,
46 | }
47 | VOCAB_NAME = 'vocab.txt'
48 | 
49 | 
50 | def load_vocab(vocab_file):
51 |     """Loads a vocabulary file into a dictionary."""
52 |     vocab = collections.OrderedDict()
53 |     index = 0
54 |     with open(vocab_file, "r", encoding="utf-8") as reader:
55 |         while True:
56 |             token = reader.readline()
57 |             if not token:
58 |                 break
59 |             token = token.strip()
60 |             vocab[token] = index
61 |             index += 1
62 |     return vocab
63 | 
64 | 
65 | def whitespace_tokenize(text):
66 |     """Runs basic whitespace cleaning and splitting on a piece of
text.""" 67 | text = text.strip() 68 | if not text: 69 | return [] 70 | tokens = text.split() 71 | return tokens 72 | 73 | 74 | class BertTokenizer(object): 75 | """Runs end-to-end tokenization: punctuation splitting + wordpiece""" 76 | 77 | def __init__(self, vocab_file, do_lower_case=True, max_len=None, 78 | never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): 79 | if not os.path.isfile(vocab_file): 80 | raise ValueError( 81 | "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " 82 | "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) 83 | self.vocab = load_vocab(vocab_file) 84 | self.ids_to_tokens = collections.OrderedDict( 85 | [(ids, tok) for tok, ids in self.vocab.items()]) 86 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, 87 | never_split=never_split) 88 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 89 | self.max_len = max_len if max_len is not None else int(1e12) 90 | 91 | def tokenize(self, text): 92 | split_tokens = [] 93 | for token in self.basic_tokenizer.tokenize(text): 94 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 95 | split_tokens.append(sub_token) 96 | return split_tokens 97 | 98 | def convert_tokens_to_ids(self, tokens): 99 | """Converts a sequence of tokens into ids using the vocab.""" 100 | ids = [] 101 | for token in tokens: 102 | ids.append(self.vocab[token]) 103 | if len(ids) > self.max_len: 104 | raise ValueError( 105 | "Token indices sequence length is longer than the specified maximum " 106 | " sequence length for this BERT model ({} > {}). Running this" 107 | " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) 108 | ) 109 | return ids 110 | 111 | def convert_ids_to_tokens(self, ids): 112 | """Converts a sequence of ids in wordpiece tokens using the vocab.""" 113 | tokens = [] 114 | for i in ids: 115 | tokens.append(self.ids_to_tokens[i]) 116 | return tokens 117 | 118 | @classmethod 119 | def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): 120 | """ 121 | Instantiate a PreTrainedBertModel from a pre-trained model file. 122 | Download and cache the pre-trained model file if needed. 123 | """ 124 | if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: 125 | vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] 126 | else: 127 | vocab_file = pretrained_model_name_or_path 128 | if os.path.isdir(vocab_file): 129 | vocab_file = os.path.join(vocab_file, VOCAB_NAME) 130 | # redirect to the cache, if necessary 131 | try: 132 | resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) 133 | except EnvironmentError: 134 | logger.error( 135 | "Model name '{}' was not found in model name list ({}). 
" 136 | "We assumed '{}' was a path or url but couldn't find any file " 137 | "associated to this path or url.".format( 138 | pretrained_model_name_or_path, 139 | ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), 140 | vocab_file)) 141 | return None 142 | if resolved_vocab_file == vocab_file: 143 | logger.info("loading vocabulary file {}".format(vocab_file)) 144 | else: 145 | logger.info("loading vocabulary file {} from cache at {}".format( 146 | vocab_file, resolved_vocab_file)) 147 | if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: 148 | # if we're using a pretrained model, ensure the tokenizer wont index sequences longer 149 | # than the number of positional embeddings 150 | max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] 151 | kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) 152 | # Instantiate tokenizer. 153 | tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) 154 | return tokenizer 155 | 156 | 157 | class BasicTokenizer(object): 158 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 159 | 160 | def __init__(self, 161 | do_lower_case=True, 162 | never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): 163 | """Constructs a BasicTokenizer. 164 | 165 | Args: 166 | do_lower_case: Whether to lower case the input. 167 | """ 168 | self.do_lower_case = do_lower_case 169 | self.never_split = never_split 170 | 171 | def tokenize(self, text): 172 | """Tokenizes a piece of text.""" 173 | text = self._clean_text(text) 174 | # This was added on November 1st, 2018 for the multilingual and Chinese 175 | # models. This is also applied to the English models now, but it doesn't 176 | # matter since the English models were not trained on any Chinese data 177 | # and generally don't have any Chinese data in them (there are Chinese 178 | # characters in the vocabulary because Wikipedia does have some Chinese 179 | # words in the English Wikipedia.). 
180 | text = self._tokenize_chinese_chars(text) 181 | orig_tokens = whitespace_tokenize(text) 182 | split_tokens = [] 183 | for token in orig_tokens: 184 | if self.do_lower_case and token not in self.never_split: 185 | token = token.lower() 186 | token = self._run_strip_accents(token) 187 | split_tokens.extend(self._run_split_on_punc(token)) 188 | 189 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 190 | return output_tokens 191 | 192 | def _run_strip_accents(self, text): 193 | """Strips accents from a piece of text.""" 194 | text = unicodedata.normalize("NFD", text) 195 | output = [] 196 | for char in text: 197 | cat = unicodedata.category(char) 198 | if cat == "Mn": 199 | continue 200 | output.append(char) 201 | return "".join(output) 202 | 203 | def _run_split_on_punc(self, text): 204 | """Splits punctuation on a piece of text.""" 205 | if text in self.never_split: 206 | return [text] 207 | chars = list(text) 208 | i = 0 209 | start_new_word = True 210 | output = [] 211 | while i < len(chars): 212 | char = chars[i] 213 | if _is_punctuation(char): 214 | output.append([char]) 215 | start_new_word = True 216 | else: 217 | if start_new_word: 218 | output.append([]) 219 | start_new_word = False 220 | output[-1].append(char) 221 | i += 1 222 | 223 | return ["".join(x) for x in output] 224 | 225 | def _tokenize_chinese_chars(self, text): 226 | """Adds whitespace around any CJK character.""" 227 | output = [] 228 | for char in text: 229 | cp = ord(char) 230 | if self._is_chinese_char(cp): 231 | output.append(" ") 232 | output.append(char) 233 | output.append(" ") 234 | else: 235 | output.append(char) 236 | return "".join(output) 237 | 238 | def _is_chinese_char(self, cp): 239 | """Checks whether CP is the codepoint of a CJK character.""" 240 | # This defines a "chinese character" as anything in the CJK Unicode block: 241 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 242 | # 243 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 244 | # despite its name. The modern Korean Hangul alphabet is a different block, 245 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 246 | # space-separated words, so they are not treated specially and handled 247 | # like the all of the other languages. 248 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 249 | (cp >= 0x3400 and cp <= 0x4DBF) or # 250 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 251 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 252 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 253 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 254 | (cp >= 0xF900 and cp <= 0xFAFF) or # 255 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 256 | return True 257 | 258 | return False 259 | 260 | def _clean_text(self, text): 261 | """Performs invalid character removal and whitespace cleanup on text.""" 262 | output = [] 263 | for char in text: 264 | cp = ord(char) 265 | if cp == 0 or cp == 0xfffd or _is_control(char): 266 | continue 267 | if _is_whitespace(char): 268 | output.append(" ") 269 | else: 270 | output.append(char) 271 | return "".join(output) 272 | 273 | 274 | class WordpieceTokenizer(object): 275 | """Runs WordPiece tokenization.""" 276 | 277 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): 278 | self.vocab = vocab 279 | self.unk_token = unk_token 280 | self.max_input_chars_per_word = max_input_chars_per_word 281 | 282 | def tokenize(self, text): 283 | """Tokenizes a piece of text into its word pieces. 
284 | 
285 |         This uses a greedy longest-match-first algorithm to perform tokenization
286 |         using the given vocabulary.
287 | 
288 |         For example:
289 |           input = "unaffable"
290 |           output = ["un", "##aff", "##able"]
291 | 
292 |         Args:
293 |           text: A single token or whitespace separated tokens. This should have
294 |             already been passed through `BasicTokenizer`.
295 | 
296 |         Returns:
297 |           A list of wordpiece tokens.
298 |         """
299 | 
300 |         output_tokens = []
301 |         for token in whitespace_tokenize(text):
302 |             chars = list(token)
303 |             if len(chars) > self.max_input_chars_per_word:
304 |                 output_tokens.append(self.unk_token)
305 |                 continue
306 | 
307 |             is_bad = False
308 |             start = 0
309 |             sub_tokens = []
310 |             while start < len(chars):
311 |                 end = len(chars)
312 |                 cur_substr = None
313 |                 while start < end:
314 |                     substr = "".join(chars[start:end])
315 |                     if start > 0:
316 |                         substr = "##" + substr
317 |                     if substr in self.vocab:
318 |                         cur_substr = substr
319 |                         break
320 |                     end -= 1
321 |                 if cur_substr is None:
322 |                     is_bad = True
323 |                     break
324 |                 sub_tokens.append(cur_substr)
325 |                 start = end
326 | 
327 |             if is_bad:
328 |                 output_tokens.append(self.unk_token)
329 |             else:
330 |                 output_tokens.extend(sub_tokens)
331 |         return output_tokens
332 | 
333 | 
334 | def _is_whitespace(char):
335 |     """Checks whether `char` is a whitespace character."""
336 |     # \t, \n, and \r are technically control characters but we treat them
337 |     # as whitespace since they are generally considered as such.
338 |     if char == " " or char == "\t" or char == "\n" or char == "\r":
339 |         return True
340 |     cat = unicodedata.category(char)
341 |     if cat == "Zs":
342 |         return True
343 |     return False
344 | 
345 | 
346 | def _is_control(char):
347 |     """Checks whether `char` is a control character."""
348 |     # These are technically control characters but we count them as whitespace
349 |     # characters.
350 |     if char == "\t" or char == "\n" or char == "\r":
351 |         return False
352 |     cat = unicodedata.category(char)
353 |     if cat.startswith("C"):
354 |         return True
355 |     return False
356 | 
357 | 
358 | def _is_punctuation(char):
359 |     """Checks whether `char` is a punctuation character."""
360 |     cp = ord(char)
361 |     # We treat all non-letter/number ASCII as punctuation.
362 |     # Characters such as "^", "$", and "`" are not in the Unicode
363 |     # Punctuation class but we treat them as punctuation anyways, for
364 |     # consistency.
365 |     if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
366 |             (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
367 |         return True
368 |     cat = unicodedata.category(char)
369 |     if cat.startswith("P"):
370 |         return True
371 |     return False
372 | 
--------------------------------------------------------------------------------
/sentence_similarity_Bert/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ Utilities for PyTorch Transformer XL model. 17 | Directly adapted from https://github.com/kimiyoung/transformer-xl. 18 | """ 19 | 20 | from collections import defaultdict 21 | 22 | import numpy as np 23 | 24 | import torch 25 | import torch.nn as nn 26 | import torch.nn.functional as F 27 | 28 | # CUDA_MAJOR = int(torch.version.cuda.split('.')[0]) 29 | # CUDA_MINOR = int(torch.version.cuda.split('.')[1]) 30 | 31 | class ProjectedAdaptiveLogSoftmax(nn.Module): 32 | def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, 33 | keep_order=False): 34 | super(ProjectedAdaptiveLogSoftmax, self).__init__() 35 | 36 | self.n_token = n_token 37 | self.d_embed = d_embed 38 | self.d_proj = d_proj 39 | 40 | self.cutoffs = cutoffs + [n_token] 41 | self.cutoff_ends = [0] + self.cutoffs 42 | self.div_val = div_val 43 | 44 | self.shortlist_size = self.cutoffs[0] 45 | self.n_clusters = len(self.cutoffs) - 1 46 | self.head_size = self.shortlist_size + self.n_clusters 47 | 48 | if self.n_clusters > 0: 49 | self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed)) 50 | self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters)) 51 | 52 | self.out_layers = nn.ModuleList() 53 | self.out_projs = nn.ParameterList() 54 | 55 | if div_val == 1: 56 | for i in range(len(self.cutoffs)): 57 | if d_proj != d_embed: 58 | self.out_projs.append( 59 | nn.Parameter(torch.Tensor(d_proj, d_embed)) 60 | ) 61 | else: 62 | self.out_projs.append(None) 63 | 64 | self.out_layers.append(nn.Linear(d_embed, n_token)) 65 | else: 66 | for i in range(len(self.cutoffs)): 67 | l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] 68 | d_emb_i = d_embed // (div_val ** i) 69 | 70 | self.out_projs.append( 71 | nn.Parameter(torch.Tensor(d_proj, d_emb_i)) 72 | ) 73 | 74 | self.out_layers.append(nn.Linear(d_emb_i, r_idx-l_idx)) 75 | 76 | self.keep_order = keep_order 77 | 78 | def _compute_logit(self, hidden, weight, bias, proj): 79 | if proj is None: 80 | logit = F.linear(hidden, weight, bias=bias) 81 | else: 82 | # if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1: 83 | proj_hid = F.linear(hidden, proj.t().contiguous()) 84 | logit = F.linear(proj_hid, weight, bias=bias) 85 | # else: 86 | # logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t())) 87 | # if bias is not None: 88 | # logit = logit + bias 89 | 90 | return logit 91 | 92 | def forward(self, hidden, target=None, keep_order=False): 93 | ''' 94 | Params: 95 | hidden :: [len*bsz x d_proj] 96 | target :: [len*bsz] 97 | Return: 98 | if target is None: 99 | out :: [len*bsz] Negative log likelihood 100 | else: 101 | out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary 102 | We could replace this implementation by the native PyTorch one 103 | if their's had an option to set bias on all clusters in the native one. 
104 | here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138 105 | ''' 106 | 107 | if target is not None: 108 | target = target.view(-1) 109 | if hidden.size(0) != target.size(0): 110 | raise RuntimeError('Input and target should have the same size ' 111 | 'in the batch dimension.') 112 | 113 | if self.n_clusters == 0: 114 | logit = self._compute_logit(hidden, self.out_layers[0].weight, 115 | self.out_layers[0].bias, self.out_projs[0]) 116 | if target is not None: 117 | output = -F.log_softmax(logit, dim=-1) \ 118 | .gather(1, target.unsqueeze(1)).squeeze(1) 119 | else: 120 | output = F.log_softmax(logit, dim=-1) 121 | else: 122 | # construct weights and biases 123 | weights, biases = [], [] 124 | for i in range(len(self.cutoffs)): 125 | if self.div_val == 1: 126 | l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] 127 | weight_i = self.out_layers[0].weight[l_idx:r_idx] 128 | bias_i = self.out_layers[0].bias[l_idx:r_idx] 129 | else: 130 | weight_i = self.out_layers[i].weight 131 | bias_i = self.out_layers[i].bias 132 | 133 | if i == 0: 134 | weight_i = torch.cat( 135 | [weight_i, self.cluster_weight], dim=0) 136 | bias_i = torch.cat( 137 | [bias_i, self.cluster_bias], dim=0) 138 | 139 | weights.append(weight_i) 140 | biases.append(bias_i) 141 | 142 | head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0] 143 | 144 | head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj) 145 | head_logprob = F.log_softmax(head_logit, dim=1) 146 | 147 | if target is None: 148 | out = hidden.new_empty((head_logit.size(0), self.n_token)) 149 | else: 150 | out = torch.zeros_like(target, dtype=hidden.dtype, device=hidden.device) 151 | 152 | offset = 0 153 | cutoff_values = [0] + self.cutoffs 154 | for i in range(len(cutoff_values) - 1): 155 | l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1] 156 | 157 | if target is not None: 158 | mask_i = (target >= l_idx) & (target < r_idx) 159 | indices_i = mask_i.nonzero().squeeze() 160 | 161 | if indices_i.numel() == 0: 162 | continue 163 | 164 | target_i = target.index_select(0, indices_i) - l_idx 165 | head_logprob_i = head_logprob.index_select(0, indices_i) 166 | hidden_i = hidden.index_select(0, indices_i) 167 | else: 168 | hidden_i = hidden 169 | 170 | if i == 0: 171 | if target is not None: 172 | logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1) 173 | else: 174 | out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]] 175 | else: 176 | weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] 177 | 178 | tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i) 179 | tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) 180 | cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster 181 | if target is not None: 182 | logprob_i = head_logprob_i[:, cluster_prob_idx] \ 183 | + tail_logprob_i.gather(1, target_i[:, None]).squeeze(1) 184 | else: 185 | logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i 186 | out[:, l_idx:r_idx] = logprob_i 187 | 188 | if target is not None: 189 | if (hasattr(self, 'keep_order') and self.keep_order) or keep_order: 190 | out.index_copy_(0, indices_i, -logprob_i) 191 | else: 192 | out[offset:offset+logprob_i.size(0)].copy_(-logprob_i) 193 | offset += logprob_i.size(0) 194 | 195 | return out 196 | 197 | 198 | def log_prob(self, hidden): 199 | r""" Computes log probabilities for all :math:`n\_classes` 200 | From: 
https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py
201 |         Args:
202 |             hidden (Tensor): a minibatch of examples
203 |         Returns:
204 |             log-probabilities for each class :math:`c`
205 |             in range :math:`0 <= c <= n\_classes`, where :math:`n\_classes` is a
206 |             parameter passed to ``AdaptiveLogSoftmaxWithLoss`` constructor.
207 |         Shape:
208 |             - Input: :math:`(N, in\_features)`
209 |             - Output: :math:`(N, n\_classes)`
210 |         """
211 |         if self.n_clusters == 0:
212 |             logit = self._compute_logit(hidden, self.out_layers[0].weight,
213 |                                         self.out_layers[0].bias, self.out_projs[0])
214 |             return F.log_softmax(logit, dim=-1)
215 |         else:
216 |             # construct weights and biases
217 |             weights, biases = [], []
218 |             for i in range(len(self.cutoffs)):
219 |                 if self.div_val == 1:
220 |                     l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
221 |                     weight_i = self.out_layers[0].weight[l_idx:r_idx]
222 |                     bias_i = self.out_layers[0].bias[l_idx:r_idx]
223 |                 else:
224 |                     weight_i = self.out_layers[i].weight
225 |                     bias_i = self.out_layers[i].bias
226 | 
227 |                 if i == 0:
228 |                     weight_i = torch.cat(
229 |                         [weight_i, self.cluster_weight], dim=0)
230 |                     bias_i = torch.cat(
231 |                         [bias_i, self.cluster_bias], dim=0)
232 | 
233 |                 weights.append(weight_i)
234 |                 biases.append(bias_i)
235 | 
236 |             head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]
237 |             head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
238 | 
239 |             out = hidden.new_empty((head_logit.size(0), self.n_token))
240 |             head_logprob = F.log_softmax(head_logit, dim=1)
241 | 
242 |             cutoff_values = [0] + self.cutoffs
243 |             for i in range(len(cutoff_values) - 1):
244 |                 start_idx, stop_idx = cutoff_values[i], cutoff_values[i + 1]
245 | 
246 |                 if i == 0:
247 |                     out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]]
248 |                 else:
249 |                     weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
250 | 
251 |                     tail_logit_i = self._compute_logit(hidden, weight_i, bias_i, proj_i)
252 |                     tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
253 | 
254 |                     logprob_i = head_logprob[:, self.cutoffs[0] + i - 1, None] + tail_logprob_i
255 |                     out[:, start_idx:stop_idx] = logprob_i
256 | 
257 |             return out
258 | 
259 | 
260 | class LogUniformSampler(object):
261 |     def __init__(self, range_max, n_sample):
262 |         """
263 |         Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py
264 |         `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`
265 | 
266 |         expected count can be approximated by 1 - (1 - p)^n
267 |         and we use a numerically stable version -expm1(num_tries * log1p(-p))
268 | 
269 |         Our implementation fixes num_tries at 2 * n_sample, and the actual #samples will vary from run to run
270 |         """
271 |         with torch.no_grad():
272 |             self.range_max = range_max
273 |             log_indices = torch.arange(1., range_max + 2., 1.).log_()
274 |             self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
275 |             # print('P', self.dist.numpy().tolist()[-30:])
276 | 
277 |             self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float()
278 | 
279 |         self.n_sample = n_sample
280 | 
281 |     def sample(self, labels):
282 |         """
283 |             labels: [b1, b2]
284 |         Return
285 |             true_log_probs: [b1, b2]
286 |             samp_log_probs: [n_sample]
287 |             neg_samples: [n_sample]
288 |         """
289 | 
290 |         # neg_samples = torch.empty(0).long()
291 |         n_sample = self.n_sample
292 |         n_tries = 2 * n_sample
293 | 
294 |         with torch.no_grad():
295 |             neg_samples = torch.multinomial(self.dist, n_tries,
replacement=True).unique() 296 | device = labels.device 297 | neg_samples = neg_samples.to(device) 298 | true_log_probs = self.log_q[labels].to(device) 299 | samp_log_probs = self.log_q[neg_samples].to(device) 300 | return true_log_probs, samp_log_probs, neg_samples 301 | 302 | def sample_logits(embedding, bias, labels, inputs, sampler): 303 | """ 304 | embedding: an nn.Embedding layer 305 | bias: [n_vocab] 306 | labels: [b1, b2] 307 | inputs: [b1, b2, n_emb] 308 | sampler: you may use a LogUniformSampler 309 | Return 310 | logits: [b1, b2, 1 + n_sample] 311 | """ 312 | true_log_probs, samp_log_probs, neg_samples = sampler.sample(labels) 313 | n_sample = neg_samples.size(0) 314 | b1, b2 = labels.size(0), labels.size(1) 315 | all_ids = torch.cat([labels.view(-1), neg_samples]) 316 | all_w = embedding(all_ids) 317 | true_w = all_w[: -n_sample].view(b1, b2, -1) 318 | sample_w = all_w[- n_sample:].view(n_sample, -1) 319 | 320 | all_b = bias[all_ids] 321 | true_b = all_b[: -n_sample].view(b1, b2) 322 | sample_b = all_b[- n_sample:] 323 | 324 | hit = (labels[:, :, None] == neg_samples).detach() 325 | 326 | true_logits = torch.einsum('ijk,ijk->ij', 327 | [true_w, inputs]) + true_b - true_log_probs 328 | sample_logits = torch.einsum('lk,ijk->ijl', 329 | [sample_w, inputs]) + sample_b - samp_log_probs 330 | sample_logits.masked_fill_(hit, -1e30) 331 | logits = torch.cat([true_logits[:, :, None], sample_logits], -1) 332 | 333 | return logits 334 | 335 | 336 | # class LogUniformSampler(object): 337 | # def __init__(self, range_max, unique=False): 338 | # """ 339 | # Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py 340 | # `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)` 341 | # """ 342 | # self.range_max = range_max 343 | # log_indices = torch.arange(1., range_max+2., 1.).log_() 344 | # self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1] 345 | 346 | # self.unique = unique 347 | 348 | # if self.unique: 349 | # self.exclude_mask = torch.ByteTensor(range_max).fill_(0) 350 | 351 | # def sample(self, n_sample, labels): 352 | # pos_sample, new_labels = labels.unique(return_inverse=True) 353 | # n_pos_sample = pos_sample.size(0) 354 | # n_neg_sample = n_sample - n_pos_sample 355 | 356 | # if self.unique: 357 | # self.exclude_mask.index_fill_(0, pos_sample, 1) 358 | # sample_dist = self.dist.clone().masked_fill_(self.exclude_mask, 0) 359 | # self.exclude_mask.index_fill_(0, pos_sample, 0) 360 | # else: 361 | # sample_dist = self.dist 362 | 363 | # neg_sample = torch.multinomial(sample_dist, n_neg_sample) 364 | 365 | # sample = torch.cat([pos_sample, neg_sample]) 366 | # sample_prob = self.dist[sample] 367 | 368 | # return new_labels, sample, sample_prob 369 | 370 | 371 | if __name__ == '__main__': 372 | S, B = 3, 4 373 | n_vocab = 10000 374 | n_sample = 5 375 | H = 32 376 | 377 | labels = torch.LongTensor(S, B).random_(0, n_vocab) 378 | 379 | # sampler = LogUniformSampler(n_vocab, unique=False) 380 | # new_labels, sample, sample_prob = sampler.sample(n_sample, labels) 381 | 382 | sampler = LogUniformSampler(n_vocab, n_sample)#, unique=True) 383 | # true_probs, samp_probs, neg_samples = sampler.sample(n_sample, labels) 384 | 385 | # print('true_probs', true_probs.numpy().tolist()) 386 | # print('samp_probs', samp_probs.numpy().tolist()) 387 | # print('neg_samples', neg_samples.numpy().tolist()) 388 | 389 | # print('sum', torch.sum(sampler.dist).item()) 390 | 391 | # assert 
torch.all(torch.sort(sample.unique())[0].eq(torch.sort(sample)[0])).item()
392 | 
393 |     embedding = nn.Embedding(n_vocab, H)
394 |     bias = torch.zeros(n_vocab)
395 |     inputs = torch.Tensor(S, B, H).normal_()
396 | 
397 |     logits = sample_logits(embedding, bias, labels, inputs, sampler)
398 |     print('logits', logits.detach().numpy().tolist())
399 |     print('logits shape', logits.size())
400 |     print('labels', labels.detach().numpy().tolist())
401 |     print('labels shape', labels.size())
402 | 
403 | 
--------------------------------------------------------------------------------
/sentence_similarity_Bert/examples/run_classifier_modify2.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """BERT finetuning runner."""
17 | 
18 | from __future__ import absolute_import, division, print_function
19 | 
20 | import argparse
21 | import csv
22 | import logging
23 | import os
24 | import random
25 | import pandas as pd
26 | import sys
27 | 
28 | import re
29 | import numpy as np
30 | import torch
31 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
32 |                               TensorDataset)
33 | from torch.utils.data.distributed import DistributedSampler
34 | from tqdm import tqdm, trange
35 | 
36 | from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
37 | from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
38 | from pytorch_pretrained_bert.tokenization import BertTokenizer
39 | from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
40 | 
41 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
42 |                     datefmt='%m/%d/%Y %H:%M:%S',
43 |                     level=logging.INFO)
44 | logger = logging.getLogger(__name__)
45 | 
46 | 
47 | class InputExample(object):
48 |     """A single training/test example for simple sequence classification."""
49 | 
50 |     def __init__(self, guid, text_a, text_b=None, label=None):
51 |         """Constructs an InputExample.
52 | 
53 |         Args:
54 |             guid: Unique id for the example.
55 |             text_a: string. The untokenized text of the first sequence. For single
56 |                 sequence tasks, only this sequence must be specified.
57 |             text_b: (Optional) string. The untokenized text of the second sequence.
58 |                 Only needs to be specified for sequence pair tasks.
59 |             label: (Optional) string. The label of the example. This should be
60 |                 specified for train and dev examples, but not for test examples.
61 |         """
62 |         self.guid = guid
63 |         self.text_a = text_a
64 |         self.text_b = text_b
65 |         self.label = label
66 | 
67 | 
68 | class InputFeatures(object):
69 |     """A single set of features of data."""
70 | 
71 |     def __init__(self, input_ids, input_mask, segment_ids, label_id):
72 |         self.input_ids = input_ids
73 |         self.input_mask = input_mask
74 |         self.segment_ids = segment_ids
75 |         self.label_id = label_id
76 | 
77 | 
78 | class DataProcessor(object):
79 |     """Base class for data converters for sequence classification data sets."""
80 | 
81 |     def get_train_examples(self, data_dir):
82 |         """Gets a collection of `InputExample`s for the train set."""
83 |         raise NotImplementedError()
84 | 
85 |     def get_dev_examples(self, data_dir):
86 |         """Gets a collection of `InputExample`s for the dev set."""
87 |         raise NotImplementedError()
88 | 
89 |     def get_labels(self):
90 |         """Gets the list of labels for this data set."""
91 |         raise NotImplementedError()
92 | 
93 | 
94 | # Modified here: data loading for the sentence-pair CSV files
95 | class SimProcessor(DataProcessor):
96 | 
97 |     def get_train_examples(self, data_dir):
98 |         """See base class."""
99 |         logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train")))
100 | 
101 |         file_path = os.path.join(data_dir, 'train.csv')
102 |         train_df = pd.read_csv(file_path, encoding='utf-8')
103 |         train_data = []
104 |         for index, train in enumerate(train_df.values):
105 |             guid = 'train-%d' % index
106 |             text_a = str(train[0])
107 |             text_b = str(train[1])
108 |             label = str(train[2])
109 |             train_data.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
110 |         return train_data
111 | 
112 | 
113 |     def get_dev_examples(self, data_dir):
114 | 
115 |         file_path = os.path.join(data_dir, 'dev.csv')
116 |         dev_df = pd.read_csv(file_path, encoding='utf-8')
117 |         dev_data = []
118 |         for index, dev in enumerate(dev_df.values):
119 |             guid = 'dev-%d' % index
120 |             text_a = str(dev[0])
121 |             text_b = str(dev[1])
122 |             label = str(dev[2])
123 |             dev_data.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
124 |         return dev_data
125 |     # CSV columns: index, sen1, sen2, label
126 | 
127 | 
128 | 
129 |     # return all of the class labels
130 |     def get_labels(self):
131 |         """See base class."""
132 |         return ["0", "1"]
133 | 
134 | 
135 | def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
136 |     """Loads a data file into a list of `InputBatch`s."""
137 | 
138 |     label_map = {label: i for i, label in enumerate(label_list)}
139 |     features = []
140 |     for (ex_index, example) in enumerate(examples):
141 |         tokens_a = tokenizer.tokenize(example.text_a)
142 |         tokens_b = None
143 |         if example.text_b:
144 |             tokens_b = tokenizer.tokenize(example.text_b)
145 |             _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
146 |         else:
147 |             if len(tokens_a) > max_seq_length - 2:
148 |                 tokens_a = tokens_a[:(max_seq_length - 2)]
149 |         tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
150 |         segment_ids = [0] * len(tokens)
151 |         if tokens_b:
152 |             tokens += tokens_b + ["[SEP]"]
153 |             segment_ids += [1] * (len(tokens_b) + 1)
154 |         input_ids = tokenizer.convert_tokens_to_ids(tokens)
155 |         input_mask = [1] * len(input_ids)
156 |         padding = [0] * (max_seq_length - len(input_ids))
157 |         input_ids += padding
158 |         input_mask += padding
159 |         segment_ids += padding
160 | 
161 |         assert len(input_ids) == max_seq_length
162 |         assert len(input_mask) == max_seq_length
163 |         assert len(segment_ids) == max_seq_length
164 | 
165 |         label_id = label_map[example.label]
166 |         if ex_index < 5:
167 |             logger.info("*** Example ***")
168 |             logger.info("guid: %s" % (example.guid))
169 | 
logger.info("tokens: %s" % " ".join( 170 | [str(x) for x in tokens])) 171 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 172 | logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) 173 | logger.info( 174 | "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) 175 | logger.info("label: %s (id = %d)" % (example.label, label_id)) 176 | 177 | features.append( 178 | InputFeatures(input_ids=input_ids, 179 | input_mask=input_mask, 180 | segment_ids=segment_ids, 181 | label_id=label_id)) 182 | return features 183 | 184 | 185 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 186 | """Truncates a sequence pair in place to the maximum length.""" 187 | 188 | # This is a simple heuristic which will always truncate the longer sequence 189 | # one token at a time. This makes more sense than truncating an equal percent 190 | # of tokens from each, since if one sequence is very short then each token 191 | # that's truncated likely contains more information than a longer sequence. 192 | while True: 193 | total_length = len(tokens_a) + len(tokens_b) 194 | if total_length <= max_length: 195 | break 196 | if len(tokens_a) > len(tokens_b): 197 | tokens_a.pop() 198 | else: 199 | tokens_b.pop() 200 | 201 | 202 | def accuracy(out, labels): 203 | outputs = np.argmax(out, axis=1) 204 | return np.sum(outputs == labels) 205 | 206 | 207 | def main(): 208 | parser = argparse.ArgumentParser() 209 | 210 | ## Required parameters 211 | parser.add_argument("--data_dir", 212 | default=None, 213 | type=str, 214 | required=True, 215 | help="The input data dir. Should contain the .tsv files (or other data files) for the task.") 216 | parser.add_argument("--bert_model", default=None, type=str, required=True, 217 | help="Bert pre-trained model selected in the list: bert-base-uncased, " 218 | "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " 219 | "bert-base-multilingual-cased, bert-base-chinese.") 220 | parser.add_argument("--task_name", 221 | default=None, 222 | type=str, 223 | required=True, 224 | help="The name of the task to train.") 225 | parser.add_argument("--output_dir", 226 | default=None, 227 | type=str, 228 | required=True, 229 | help="The output directory where the model predictions and checkpoints will be written.") 230 | 231 | ## Other parameters 232 | parser.add_argument("--cache_dir", 233 | default="", 234 | type=str, 235 | help="Where do you want to store the pre-trained models downloaded from s3") 236 | parser.add_argument("--max_seq_length", 237 | default=128, 238 | type=int, 239 | help="The maximum total input sequence length after WordPiece tokenization. 
\n" 240 | "Sequences longer than this will be truncated, and sequences shorter \n" 241 | "than this will be padded.") 242 | parser.add_argument("--do_train", 243 | action='store_true', 244 | help="Whether to run training.") 245 | parser.add_argument("--do_eval", 246 | action='store_true', 247 | help="Whether to run eval on the dev set.") 248 | parser.add_argument("--do_lower_case", 249 | action='store_true', 250 | help="Set this flag if you are using an uncased model.") 251 | parser.add_argument("--train_batch_size", 252 | default=32, 253 | type=int, 254 | help="Total batch size for training.") 255 | parser.add_argument("--eval_batch_size", 256 | default=8, 257 | type=int, 258 | help="Total batch size for eval.") 259 | parser.add_argument("--learning_rate", 260 | default=5e-5, 261 | type=float, 262 | help="The initial learning rate for Adam.") 263 | parser.add_argument("--num_train_epochs", 264 | default=1.0, 265 | type=float, 266 | help="Total number of training epochs to perform.") 267 | parser.add_argument("--warmup_proportion", 268 | default=0.1, 269 | type=float, 270 | help="Proportion of training to perform linear learning rate warmup for. " 271 | "E.g., 0.1 = 10%% of training.") 272 | parser.add_argument("--no_cuda", 273 | action='store_true', 274 | help="Whether not to use CUDA when available") 275 | parser.add_argument("--local_rank", 276 | type=int, 277 | default=-1, 278 | help="local_rank for distributed training on gpus") 279 | parser.add_argument('--seed', 280 | type=int, 281 | default=42, 282 | help="random seed for initialization") 283 | parser.add_argument('--gradient_accumulation_steps', 284 | type=int, 285 | default=1, 286 | help="Number of updates steps to accumulate before performing a backward/update pass.") 287 | parser.add_argument('--fp16', 288 | action='store_true', 289 | help="Whether to use 16-bit float precision instead of 32-bit") 290 | parser.add_argument('--loss_scale', 291 | type=float, default=0, 292 | help="Loss scaling to improve fp16 numeric stability. 


def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .csv files (train.csv / dev.csv) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir",
                        default="",
                        type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=1.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Do not use CUDA even when it is available.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {
        # "cola": ColaProcessor,
        # "mnli": MnliProcessor,
        "mrpc": SimProcessor
    }

    num_labels_task = {
        "mrpc": 2,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
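
    # Editor's note (illustrative numbers, not from the original): with 100,000 training
    # pairs, train_batch_size=32, gradient_accumulation_steps=1 and num_train_epochs=1.0,
    # num_train_optimization_steps = int(100000 / 32 / 1) * 1.0 = 3125.0 optimizer updates;
    # BertAdam below uses this value as t_total to schedule the linear warmup and decay.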

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
                                                                   'distributed_{}'.format(args.local_rank))
    model = BertForSequenceClassification.from_pretrained(args.bert_model,
                                                          cache_dir=cache_dir,
                                                          num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
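                # Editor's note: dividing the loss by gradient_accumulation_steps (below)
                # keeps the accumulated gradient equal to the mean over the effective
                # batch; e.g. with gradient_accumulation_steps=4, optimizer.step() only
                # runs on every 4th mini-batch, so four quarter-scaled losses sum to one
                # full-batch average.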
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(global_step / num_train_optimization_steps,
                                                                          args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load a trained model and config that you have fine-tuned
        config = BertConfig(output_config_file)
        model = BertForSequenceClassification(config, num_labels=num_labels)
        model.load_state_dict(torch.load(output_model_file))
    else:
        model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'global_step': global_step,
                  'loss': loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
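
Usage sketch (editor's addition; paths and hyperparameter values are placeholders, not from
the original repository). The runner above is driven entirely by argparse, and the only
registered task key is "mrpc" (mapped to SimProcessor), so a typical fine-tuning launch
with the Chinese model looks like:

    python run_classifier_modify2.py \
        --task_name mrpc \
        --do_train \
        --do_eval \
        --data_dir /path/to/csv_data \
        --bert_model bert-base-chinese \
        --max_seq_length 128 \
        --train_batch_size 32 \
        --learning_rate 5e-5 \
        --num_train_epochs 1.0 \
        --output_dir /path/to/output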