├── data
│   ├── data
│   │   └── README
│   ├── README
│   └── model
│       └── README
├── code
│   ├── requirements.txt
│   ├── pytorch_pretrained_bert
│   │   ├── __init__.py
│   │   ├── __main__.py
│   │   ├── convert_tf_checkpoint_to_pytorch.py
│   │   ├── optimization.py
│   │   ├── file_utils.py
│   │   ├── tokenization.py
│   │   └── modeling.py
│   ├── setup.py
│   ├── LICENSE
│   ├── run_arc.py
│   └── README.md
├── LISCENSE.txt
├── README.md
└── TPN
/data/data/README: -------------------------------------------------------------------------------- 1 | put your data here 2 | -------------------------------------------------------------------------------- /data/README: -------------------------------------------------------------------------------- 1 | put your model and data folder here 2 | -------------------------------------------------------------------------------- /data/model/README: -------------------------------------------------------------------------------- 1 | put pre-trained model from Google here 2 | -------------------------------------------------------------------------------- /code/requirements.txt: -------------------------------------------------------------------------------- 1 | # PyTorch 2 | torch>=0.4.1 3 | # progress bars in model download and training scripts 4 | tqdm 5 | # Accessing files from S3 directly. 6 | boto3 7 | # Used for downloading models over HTTP 8 | requests -------------------------------------------------------------------------------- /code/pytorch_pretrained_bert/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.4.0" 2 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer 3 | from .modeling import (BertConfig, BertModel, BertForPreTraining, 4 | BertForMaskedLM, BertForNextSentencePrediction, 5 | BertForSequenceClassification, BertForMultipleChoice, 6 | BertForTokenClassification, BertForQuestionAnswering) 7 | from .optimization import BertAdam 8 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE 9 | -------------------------------------------------------------------------------- /code/pytorch_pretrained_bert/__main__.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | if __name__ == '__main__': 3 | import sys 4 | try: 5 | from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch 6 | except ModuleNotFoundError: 7 | print("pytorch_pretrained_bert can only be used from the command line to convert TensorFlow models to PyTorch. "
Please see " 9 | "https://www.tensorflow.org/install/ for installation instructions.") 10 | raise 11 | 12 | if len(sys.argv) != 5: 13 | # pylint: disable=line-too-long 14 | print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") 15 | else: 16 | PYTORCH_DUMP_OUTPUT = sys.argv.pop() 17 | TF_CONFIG = sys.argv.pop() 18 | TF_CHECKPOINT = sys.argv.pop() 19 | convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) 20 | -------------------------------------------------------------------------------- /LISCENSE.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------- START OF LICENSE ----------------------------------------- 2 | ARC-Challenge 3 | Copyright (c) Microsoft CorporationAll rights reserved.MIT LicensePermission is hereby granted, free of charge, to any person obtaining a 4 | copy of this software and associated documentation files (the Software), to deal in the Software without restriction, including without 5 | limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit 6 | persons to whom the Software is furnished to do so, subject to the following conditions:The above copyright notice and this permission 7 | notice shall be included in all copies or substantial portions of the Software.THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY 8 | KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 9 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 10 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 11 | ----------------------------------------------- END OF LICENSE ------------------------------------------ 12 | 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ARC-Challenge Baseline 2 | This repo describes how to get the baseline score for ARC challenge dataset using BERT 3 | 4 | The major part of code is obtained from https://github.com/huggingface/pytorch-pretrained-BERT, and run_arc.py is modified based on run_swag.py in the repo. 5 | 6 | ## Pre-trained on RACE 7 | The first step is to pre-train the model on RACE (**both** M and H), and select epoch and other hyper parameters (step size and batch size) by a **merged** RACE validation dataset. After this step, the BERT-large model should have ~67% (accuracy) on RACE-H test and ~40% on ARC-Challenge test, and BERT-base should have ~63% on RACE-H test and ~36% on ARC-Challenge test. 8 | 9 | ## Finetune on ARC 10 | In this step, you need to train on **merged** ARC-Easy and ARC-challenge based on the previous pre-trained model, and select hyper-parameters based on the **merged** ARC-Easy and ARC-Challenge validation dataset (selected model has 58% or so accuracy on this merged validation data). After this step, the single model performance on ARC-Challenge test is around 44-45% and 68-69% on ARC-Easy. Performance could be further boosted by ensembling more models. 11 | 12 | ## Dataset 13 | Race dataset could be obtained following this link https://github.com/qizhex/RACE_AR_baselines. 
-------------------------------------------------------------------------------- /code/setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple checklist from AllenNLP repo: https://github.com/allenai/allennlp/blob/master/setup.py 3 | 4 | To create the package for pypi. 5 | 6 | 1. Change the version in __init__.py and setup.py. 7 | 8 | 2. Commit these changes with the message: "Release: VERSION" 9 | 10 | 3. Add a tag in git to mark the release: "git tag VERSION -m'Adds tag VERSION for pypi' " 11 | Push the tag to git: git push --tags origin master 12 | 13 | 4. Build both the sources and the wheel. Do not change anything in setup.py between 14 | creating the wheel and the source distribution (obviously). 15 | 16 | For the wheel, run: "python setup.py bdist_wheel" in the top level allennlp directory. 17 | (this will build a wheel for the python version you use to build it - make sure you use python 3.x). 18 | 19 | For the sources, run: "python setup.py sdist" 20 | You should now have a /dist directory with both .whl and .tar.gz source versions of allennlp. 21 | 22 | 5. Check that everything looks correct by uploading the package to the pypi test server: 23 | 24 | twine upload dist/* -r pypitest 25 | (pypi suggests using twine as other methods upload files via plaintext.) 26 | 27 | Check that you can install it in a virtualenv by running: 28 | pip install -i https://testpypi.python.org/pypi allennlp 29 | 30 | 6. Upload the final version to actual pypi: 31 | twine upload dist/* -r pypi 32 | 33 | 7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory. 34 | 35 | """ 36 | from setuptools import find_packages, setup 37 | 38 | setup( 39 | name="pytorch_pretrained_bert", 40 | version="0.4.0", 41 | author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors", 42 | author_email="thomas@huggingface.co", 43 | description="PyTorch version of Google AI BERT model with script to load Google pre-trained models", 44 | long_description=open("README.md", "r", encoding='utf-8').read(), 45 | long_description_content_type="text/markdown", 46 | keywords='BERT NLP deep learning google', 47 | license='Apache', 48 | url="https://github.com/huggingface/pytorch-pretrained-BERT", 49 | packages=find_packages(exclude=["*.tests", "*.tests.*", 50 | "tests.*", "tests"]), 51 | install_requires=['torch>=0.4.1', 52 | 'numpy', 53 | 'boto3', 54 | 'requests', 55 | 'tqdm'], 56 | scripts=["bin/pytorch_pretrained_bert"], 57 | python_requires='>=3.5.0', 58 | tests_require=['pytest'], 59 | classifiers=[ 60 | 'Intended Audience :: Science/Research', 61 | 'License :: OSI Approved :: Apache Software License', 62 | 'Programming Language :: Python :: 3', 63 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 64 | ], 65 | ) 66 | -------------------------------------------------------------------------------- /code/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team.
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import re 23 | import argparse 24 | import tensorflow as tf 25 | import torch 26 | import numpy as np 27 | 28 | from .modeling import BertConfig, BertForPreTraining 29 | 30 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 31 | config_path = os.path.abspath(bert_config_file) 32 | tf_path = os.path.abspath(tf_checkpoint_path) 33 | print("Converting TensorFlow checkpoint from {} with config at {}".format(tf_path, config_path)) 34 | # Load weights from TF model 35 | init_vars = tf.train.list_variables(tf_path) 36 | names = [] 37 | arrays = [] 38 | for name, shape in init_vars: 39 | print("Loading TF weight {} with shape {}".format(name, shape)) 40 | array = tf.train.load_variable(tf_path, name) 41 | names.append(name) 42 | arrays.append(array) 43 | 44 | # Initialise PyTorch model 45 | config = BertConfig.from_json_file(bert_config_file) 46 | print("Building PyTorch model from configuration: {}".format(str(config))) 47 | model = BertForPreTraining(config) 48 | 49 | for name, array in zip(names, arrays): 50 | name = name.split('/') 51 | # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v 52 | # which are not required for using pretrained model 53 | if any(n in ["adam_v", "adam_m"] for n in name): 54 | print("Skipping {}".format("/".join(name))) 55 | continue 56 | pointer = model 57 | for m_name in name: 58 | if re.fullmatch(r'[A-Za-z]+_\d+', m_name): 59 | l = re.split(r'_(\d+)', m_name) 60 | else: 61 | l = [m_name] 62 | if l[0] == 'kernel' or l[0] == 'gamma': 63 | pointer = getattr(pointer, 'weight') 64 | elif l[0] == 'output_bias' or l[0] == 'beta': 65 | pointer = getattr(pointer, 'bias') 66 | elif l[0] == 'output_weights': 67 | pointer = getattr(pointer, 'weight') 68 | else: 69 | pointer = getattr(pointer, l[0]) 70 | if len(l) >= 2: 71 | num = int(l[1]) 72 | pointer = pointer[num] 73 | if m_name[-11:] == '_embeddings': 74 | pointer = getattr(pointer, 'weight') 75 | elif m_name == 'kernel': 76 | array = np.transpose(array) 77 | try: 78 | assert pointer.shape == array.shape 79 | except AssertionError as e: 80 | e.args += (pointer.shape, array.shape) 81 | raise 82 | print("Initialize PyTorch weight {}".format(name)) 83 | pointer.data = torch.from_numpy(array) 84 | 85 | # Save pytorch-model 86 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 87 | torch.save(model.state_dict(), pytorch_dump_path) 88 | 89 | 90 | if __name__ == "__main__": 91 | parser = argparse.ArgumentParser() 92 | ## Required parameters 93 | parser.add_argument("--tf_checkpoint_path", 94 | default = None, 95 | type = str, 96 | required = True, 97 | help = "Path the TensorFlow checkpoint path.") 98 | parser.add_argument("--bert_config_file", 99 | default = None, 100 
| type = str, 101 | required = True, 102 | help = "The config json file corresponding to the pre-trained BERT model. \n" 103 | "This specifies the model architecture.") 104 | parser.add_argument("--pytorch_dump_path", 105 | default = None, 106 | type = str, 107 | required = True, 108 | help = "Path to the output PyTorch model.") 109 | args = parser.parse_args() 110 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, 111 | args.bert_config_file, 112 | args.pytorch_dump_path) 113 | -------------------------------------------------------------------------------- /code/pytorch_pretrained_bert/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """PyTorch optimization for BERT model.""" 16 | 17 | import math 18 | import torch 19 | from torch.optim import Optimizer 20 | from torch.optim.optimizer import required 21 | from torch.nn.utils import clip_grad_norm_ 22 | 23 | def warmup_cosine(x, warmup=0.002): 24 | if x < warmup: 25 | return x/warmup 26 | return 0.5 * (1.0 + torch.cos(math.pi * x)) 27 | 28 | def warmup_constant(x, warmup=0.002): 29 | if x < warmup: 30 | return x/warmup 31 | return 1.0 32 | 33 | def warmup_linear(x, warmup=0.002): 34 | if x < warmup: 35 | return x/warmup 36 | return 1.0 - x 37 | 38 | SCHEDULES = { 39 | 'warmup_cosine':warmup_cosine, 40 | 'warmup_constant':warmup_constant, 41 | 'warmup_linear':warmup_linear, 42 | } 43 | 44 | 45 | class BertAdam(Optimizer): 46 | """Implements BERT version of Adam algorithm with weight decay fix. 47 | Params: 48 | lr: learning rate 49 | warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 50 | t_total: total number of training steps for the learning 51 | rate schedule, -1 means constant learning rate. Default: -1 52 | schedule: schedule to use for the warmup (see above). Default: 'warmup_linear' 53 | b1: Adams b1. Default: 0.9 54 | b2: Adams b2. Default: 0.999 55 | e: Adams epsilon. Default: 1e-6 56 | weight_decay: Weight decay. Default: 0.01 57 | max_grad_norm: Maximum norm for the gradients (-1 means no clipping). 
Default: 1.0 58 | """ 59 | def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', 60 | b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, 61 | max_grad_norm=1.0): 62 | if lr is not required and lr < 0.0: 63 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 64 | if schedule not in SCHEDULES: 65 | raise ValueError("Invalid schedule parameter: {}".format(schedule)) 66 | if not 0.0 <= warmup < 1.0 and not warmup == -1: 67 | raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) 68 | if not 0.0 <= b1 < 1.0: 69 | raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) 70 | if not 0.0 <= b2 < 1.0: 71 | raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) 72 | if not e >= 0.0: 73 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) 74 | defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, 75 | b1=b1, b2=b2, e=e, weight_decay=weight_decay, 76 | max_grad_norm=max_grad_norm) 77 | super(BertAdam, self).__init__(params, defaults) 78 | 79 | def get_lr(self): 80 | lr = [] 81 | for group in self.param_groups: 82 | for p in group['params']: 83 | state = self.state[p] 84 | if len(state) == 0: 85 | return [0] 86 | if group['t_total'] != -1: 87 | schedule_fct = SCHEDULES[group['schedule']] 88 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) 89 | else: 90 | lr_scheduled = group['lr'] 91 | lr.append(lr_scheduled) 92 | return lr 93 | 94 | def step(self, closure=None): 95 | """Performs a single optimization step. 96 | 97 | Arguments: 98 | closure (callable, optional): A closure that reevaluates the model 99 | and returns the loss. 100 | """ 101 | loss = None 102 | if closure is not None: 103 | loss = closure() 104 | 105 | for group in self.param_groups: 106 | for p in group['params']: 107 | if p.grad is None: 108 | continue 109 | grad = p.grad.data 110 | if grad.is_sparse: 111 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 112 | 113 | state = self.state[p] 114 | 115 | # State initialization 116 | if len(state) == 0: 117 | state['step'] = 0 118 | # Exponential moving average of gradient values 119 | state['next_m'] = torch.zeros_like(p.data) 120 | # Exponential moving average of squared gradient values 121 | state['next_v'] = torch.zeros_like(p.data) 122 | 123 | next_m, next_v = state['next_m'], state['next_v'] 124 | beta1, beta2 = group['b1'], group['b2'] 125 | 126 | # Add grad clipping 127 | if group['max_grad_norm'] > 0: 128 | clip_grad_norm_(p, group['max_grad_norm']) 129 | 130 | # Decay the first and second moment running average coefficient 131 | # In-place operations to update the averages at the same time 132 | next_m.mul_(beta1).add_(1 - beta1, grad) 133 | next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) 134 | update = next_m / (next_v.sqrt() + group['e']) 135 | 136 | # Just adding the square of the weights to the loss function is *not* 137 | # the correct way of using L2 regularization/weight decay with Adam, 138 | # since that will interact with the m and v parameters in strange ways. 139 | # 140 | # Instead we want to decay the weights in a manner that doesn't interact 141 | # with the m/v parameters. This is equivalent to adding the square 142 | # of the weights to the loss with plain (non-momentum) SGD. 
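# In other words, this is the decoupled ("AdamW"-style) weight decay used by the
# original TensorFlow BERT optimizer: the decay term is added to the update itself
# and scaled by the learning rate below, not folded into the gradient. Note also
# that no bias correction is applied to next_m/next_v (see the commented-out
# bias_correction lines at the end of this method), again matching the TF optimizer.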
143 | if group['weight_decay'] > 0.0: 144 | update += group['weight_decay'] * p.data 145 | 146 | if group['t_total'] != -1: 147 | schedule_fct = SCHEDULES[group['schedule']] 148 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) 149 | else: 150 | lr_scheduled = group['lr'] 151 | 152 | update_with_lr = lr_scheduled * update 153 | p.data.add_(-update_with_lr) 154 | 155 | state['step'] += 1 156 | 157 | # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 158 | # No bias correction 159 | # bias_correction1 = 1 - beta1 ** state['step'] 160 | # bias_correction2 = 1 - beta2 ** state['step'] 161 | 162 | return loss 163 | -------------------------------------------------------------------------------- /code/pytorch_pretrained_bert/file_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for working with the local dataset cache. 3 | This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp 4 | Copyright by the AllenNLP authors. 5 | """ 6 | 7 | import os 8 | import logging 9 | import shutil 10 | import tempfile 11 | import json 12 | from urllib.parse import urlparse 13 | from pathlib import Path 14 | from typing import Optional, Tuple, Union, IO, Callable, Set 15 | from hashlib import sha256 16 | from functools import wraps 17 | 18 | from tqdm import tqdm 19 | 20 | import boto3 21 | from botocore.exceptions import ClientError 22 | import requests 23 | 24 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 25 | 26 | PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 27 | Path.home() / '.pytorch_pretrained_bert')) 28 | 29 | 30 | def url_to_filename(url: str, etag: str = None) -> str: 31 | """ 32 | Convert `url` into a hashed filename in a repeatable way. 33 | If `etag` is specified, append its hash to the url's, delimited 34 | by a period. 35 | """ 36 | url_bytes = url.encode('utf-8') 37 | url_hash = sha256(url_bytes) 38 | filename = url_hash.hexdigest() 39 | 40 | if etag: 41 | etag_bytes = etag.encode('utf-8') 42 | etag_hash = sha256(etag_bytes) 43 | filename += '.' + etag_hash.hexdigest() 44 | 45 | return filename 46 | 47 | 48 | def filename_to_url(filename: str, cache_dir: Union[str, Path] = None) -> Tuple[str, str]: 49 | """ 50 | Return the url and etag (which may be ``None``) stored for `filename`. 51 | Raise ``FileNotFoundError`` if `filename` or its stored metadata do not exist. 52 | """ 53 | if cache_dir is None: 54 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 55 | if isinstance(cache_dir, Path): 56 | cache_dir = str(cache_dir) 57 | 58 | cache_path = os.path.join(cache_dir, filename) 59 | if not os.path.exists(cache_path): 60 | raise FileNotFoundError("file {} not found".format(cache_path)) 61 | 62 | meta_path = cache_path + '.json' 63 | if not os.path.exists(meta_path): 64 | raise FileNotFoundError("file {} not found".format(meta_path)) 65 | 66 | with open(meta_path) as meta_file: 67 | metadata = json.load(meta_file) 68 | url = metadata['url'] 69 | etag = metadata['etag'] 70 | 71 | return url, etag 72 | 73 | 74 | def cached_path(url_or_filename: Union[str, Path], cache_dir: Union[str, Path] = None) -> str: 75 | """ 76 | Given something that might be a URL (or might be a local path), 77 | determine which. If it's a URL, download the file and cache it, and 78 | return the path to the cached file. If it's already a local path, 79 | make sure the file exists and then return the path. 
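    Example (illustrative; this vocab URL is one of the entries in
    tokenization.PRETRAINED_VOCAB_ARCHIVE_MAP):
        local_path = cached_path("https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt")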
80 | """ 81 | if cache_dir is None: 82 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 83 | if isinstance(url_or_filename, Path): 84 | url_or_filename = str(url_or_filename) 85 | if isinstance(cache_dir, Path): 86 | cache_dir = str(cache_dir) 87 | 88 | parsed = urlparse(url_or_filename) 89 | 90 | if parsed.scheme in ('http', 'https', 's3'): 91 | # URL, so get it from the cache (downloading if necessary) 92 | return get_from_cache(url_or_filename, cache_dir) 93 | elif os.path.exists(url_or_filename): 94 | # File, and it exists. 95 | return url_or_filename 96 | elif parsed.scheme == '': 97 | # File, but it doesn't exist. 98 | raise FileNotFoundError("file {} not found".format(url_or_filename)) 99 | else: 100 | # Something unknown 101 | raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) 102 | 103 | 104 | def split_s3_path(url: str) -> Tuple[str, str]: 105 | """Split a full s3 path into the bucket name and path.""" 106 | parsed = urlparse(url) 107 | if not parsed.netloc or not parsed.path: 108 | raise ValueError("bad s3 path {}".format(url)) 109 | bucket_name = parsed.netloc 110 | s3_path = parsed.path 111 | # Remove '/' at beginning of path. 112 | if s3_path.startswith("/"): 113 | s3_path = s3_path[1:] 114 | return bucket_name, s3_path 115 | 116 | 117 | def s3_request(func: Callable): 118 | """ 119 | Wrapper function for s3 requests in order to create more helpful error 120 | messages. 121 | """ 122 | 123 | @wraps(func) 124 | def wrapper(url: str, *args, **kwargs): 125 | try: 126 | return func(url, *args, **kwargs) 127 | except ClientError as exc: 128 | if int(exc.response["Error"]["Code"]) == 404: 129 | raise FileNotFoundError("file {} not found".format(url)) 130 | else: 131 | raise 132 | 133 | return wrapper 134 | 135 | 136 | @s3_request 137 | def s3_etag(url: str) -> Optional[str]: 138 | """Check ETag on S3 object.""" 139 | s3_resource = boto3.resource("s3") 140 | bucket_name, s3_path = split_s3_path(url) 141 | s3_object = s3_resource.Object(bucket_name, s3_path) 142 | return s3_object.e_tag 143 | 144 | 145 | @s3_request 146 | def s3_get(url: str, temp_file: IO) -> None: 147 | """Pull a file directly from S3.""" 148 | s3_resource = boto3.resource("s3") 149 | bucket_name, s3_path = split_s3_path(url) 150 | s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) 151 | 152 | 153 | def http_get(url: str, temp_file: IO) -> None: 154 | req = requests.get(url, stream=True) 155 | content_length = req.headers.get('Content-Length') 156 | total = int(content_length) if content_length is not None else None 157 | progress = tqdm(unit="B", total=total) 158 | for chunk in req.iter_content(chunk_size=1024): 159 | if chunk: # filter out keep-alive new chunks 160 | progress.update(len(chunk)) 161 | temp_file.write(chunk) 162 | progress.close() 163 | 164 | 165 | def get_from_cache(url: str, cache_dir: Union[str, Path] = None) -> str: 166 | """ 167 | Given a URL, look for the corresponding dataset in the local cache. 168 | If it's not there, download it. Then return the path to the cached file. 169 | """ 170 | if cache_dir is None: 171 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 172 | if isinstance(cache_dir, Path): 173 | cache_dir = str(cache_dir) 174 | 175 | os.makedirs(cache_dir, exist_ok=True) 176 | 177 | # Get eTag to add to filename, if it exists. 
178 | if url.startswith("s3://"): 179 | etag = s3_etag(url) 180 | else: 181 | response = requests.head(url, allow_redirects=True) 182 | if response.status_code != 200: 183 | raise IOError("HEAD request failed for url {} with status code {}" 184 | .format(url, response.status_code)) 185 | etag = response.headers.get("ETag") 186 | 187 | filename = url_to_filename(url, etag) 188 | 189 | # get cache path to put the file 190 | cache_path = os.path.join(cache_dir, filename) 191 | 192 | if not os.path.exists(cache_path): 193 | # Download to temporary file, then copy to cache dir once finished. 194 | # Otherwise you get corrupt cache entries if the download gets interrupted. 195 | with tempfile.NamedTemporaryFile() as temp_file: 196 | logger.info("%s not found in cache, downloading to %s", url, temp_file.name) 197 | 198 | # GET file object 199 | if url.startswith("s3://"): 200 | s3_get(url, temp_file) 201 | else: 202 | http_get(url, temp_file) 203 | 204 | # we are copying the file before closing it, so flush to avoid truncation 205 | temp_file.flush() 206 | # shutil.copyfileobj() starts at the current position, so go to the start 207 | temp_file.seek(0) 208 | 209 | logger.info("copying %s to cache at %s", temp_file.name, cache_path) 210 | with open(cache_path, 'wb') as cache_file: 211 | shutil.copyfileobj(temp_file, cache_file) 212 | 213 | logger.info("creating metadata file for %s", cache_path) 214 | meta = {'url': url, 'etag': etag} 215 | meta_path = cache_path + '.json' 216 | with open(meta_path, 'w') as meta_file: 217 | json.dump(meta, meta_file) 218 | 219 | logger.info("removing temp file %s", temp_file.name) 220 | 221 | return cache_path 222 | 223 | 224 | def read_set_from_file(filename: str) -> Set[str]: 225 | ''' 226 | Extract a de-duped collection (set) of text from a file. 227 | Expected file format is one item per line. 228 | ''' 229 | collection = set() 230 | with open(filename, 'r', encoding='utf-8') as file_: 231 | for line in file_: 232 | collection.add(line.rstrip()) 233 | return collection 234 | 235 | 236 | def get_file_extension(path: str, dot=True, lower: bool = True): 237 | ext = os.path.splitext(path)[1] 238 | ext = ext if dot else ext[1:] 239 | return ext.lower() if lower else ext 240 | -------------------------------------------------------------------------------- /code/LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | -------------------------------------------------------------------------------- /TPN: -------------------------------------------------------------------------------- 1 | NOTICES AND INFORMATION 2 | Do Not Translate or Localize 3 | 4 | This software incorporates material from third parties. Microsoft makes certain 5 | open source code available at http://3rdpartysource.microsoft.com, or you may 6 | send a check or money order for US $5.00, including the product name, the open 7 | source component name, and version number, to: 8 | 9 | Source Code Compliance Team 10 | Microsoft Corporation 11 | One Microsoft Way 12 | Redmond, WA 98052 13 | USA 14 | 15 | Notwithstanding any other terms, you may reverse engineer this software to the 16 | extent required to debug changes to any libraries licensed under the GNU Lesser 17 | General Public License. 18 | 19 | Component. https://github.com/huggingface/pytorch-pretrained-BERT 20 | 21 | Open Source License/Copyright Notice. 22 | Apache License 23 | Version 2.0, January 2004 24 | http://www.apache.org/licenses/ 25 | 26 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 27 | 28 | 1. Definitions. 29 | 30 | "License" shall mean the terms and conditions for use, reproduction, 31 | and distribution as defined by Sections 1 through 9 of this document. 32 | 33 | "Licensor" shall mean the copyright owner or entity authorized by 34 | the copyright owner that is granting the License. 35 | 36 | "Legal Entity" shall mean the union of the acting entity and all 37 | other entities that control, are controlled by, or are under common 38 | control with that entity. For the purposes of this definition, 39 | "control" means (i) the power, direct or indirect, to cause the 40 | direction or management of such entity, whether by contract or 41 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 42 | outstanding shares, or (iii) beneficial ownership of such entity. 43 | 44 | "You" (or "Your") shall mean an individual or Legal Entity 45 | exercising permissions granted by this License. 46 | 47 | "Source" form shall mean the preferred form for making modifications, 48 | including but not limited to software source code, documentation 49 | source, and configuration files. 50 | 51 | "Object" form shall mean any form resulting from mechanical 52 | transformation or translation of a Source form, including but 53 | not limited to compiled object code, generated documentation, 54 | and conversions to other media types. 55 | 56 | "Work" shall mean the work of authorship, whether in Source or 57 | Object form, made available under the License, as indicated by a 58 | copyright notice that is included in or attached to the work 59 | (an example is provided in the Appendix below). 60 | 61 | "Derivative Works" shall mean any work, whether in Source or Object 62 | form, that is based on (or derived from) the Work and for which the 63 | editorial revisions, annotations, elaborations, or other modifications 64 | represent, as a whole, an original work of authorship. For the purposes 65 | of this License, Derivative Works shall not include works that remain 66 | separable from, or merely link (or bind by name) to the interfaces of, 67 | the Work and Derivative Works thereof. 
68 | 69 | "Contribution" shall mean any work of authorship, including 70 | the original version of the Work and any modifications or additions 71 | to that Work or Derivative Works thereof, that is intentionally 72 | submitted to Licensor for inclusion in the Work by the copyright owner 73 | or by an individual or Legal Entity authorized to submit on behalf of 74 | the copyright owner. For the purposes of this definition, "submitted" 75 | means any form of electronic, verbal, or written communication sent 76 | to the Licensor or its representatives, including but not limited to 77 | communication on electronic mailing lists, source code control systems, 78 | and issue tracking systems that are managed by, or on behalf of, the 79 | Licensor for the purpose of discussing and improving the Work, but 80 | excluding communication that is conspicuously marked or otherwise 81 | designated in writing by the copyright owner as "Not a Contribution." 82 | 83 | "Contributor" shall mean Licensor and any individual or Legal Entity 84 | on behalf of whom a Contribution has been received by Licensor and 85 | subsequently incorporated within the Work. 86 | 87 | 2. Grant of Copyright License. Subject to the terms and conditions of 88 | this License, each Contributor hereby grants to You a perpetual, 89 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 90 | copyright license to reproduce, prepare Derivative Works of, 91 | publicly display, publicly perform, sublicense, and distribute the 92 | Work and such Derivative Works in Source or Object form. 93 | 94 | 3. Grant of Patent License. Subject to the terms and conditions of 95 | this License, each Contributor hereby grants to You a perpetual, 96 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 97 | (except as stated in this section) patent license to make, have made, 98 | use, offer to sell, sell, import, and otherwise transfer the Work, 99 | where such license applies only to those patent claims licensable 100 | by such Contributor that are necessarily infringed by their 101 | Contribution(s) alone or by combination of their Contribution(s) 102 | with the Work to which such Contribution(s) was submitted. If You 103 | institute patent litigation against any entity (including a 104 | cross-claim or counterclaim in a lawsuit) alleging that the Work 105 | or a Contribution incorporated within the Work constitutes direct 106 | or contributory patent infringement, then any patent licenses 107 | granted to You under this License for that Work shall terminate 108 | as of the date such litigation is filed. 109 | 110 | 4. Redistribution. 
You may reproduce and distribute copies of the 111 | Work or Derivative Works thereof in any medium, with or without 112 | modifications, and in Source or Object form, provided that You 113 | meet the following conditions: 114 | 115 | (a) You must give any other recipients of the Work or 116 | Derivative Works a copy of this License; and 117 | 118 | (b) You must cause any modified files to carry prominent notices 119 | stating that You changed the files; and 120 | 121 | (c) You must retain, in the Source form of any Derivative Works 122 | that You distribute, all copyright, patent, trademark, and 123 | attribution notices from the Source form of the Work, 124 | excluding those notices that do not pertain to any part of 125 | the Derivative Works; and 126 | 127 | (d) If the Work includes a "NOTICE" text file as part of its 128 | distribution, then any Derivative Works that You distribute must 129 | include a readable copy of the attribution notices contained 130 | within such NOTICE file, excluding those notices that do not 131 | pertain to any part of the Derivative Works, in at least one 132 | of the following places: within a NOTICE text file distributed 133 | as part of the Derivative Works; within the Source form or 134 | documentation, if provided along with the Derivative Works; or, 135 | within a display generated by the Derivative Works, if and 136 | wherever such third-party notices normally appear. The contents 137 | of the NOTICE file are for informational purposes only and 138 | do not modify the License. You may add Your own attribution 139 | notices within Derivative Works that You distribute, alongside 140 | or as an addendum to the NOTICE text from the Work, provided 141 | that such additional attribution notices cannot be construed 142 | as modifying the License. 143 | 144 | You may add Your own copyright statement to Your modifications and 145 | may provide additional or different license terms and conditions 146 | for use, reproduction, or distribution of Your modifications, or 147 | for any such Derivative Works as a whole, provided Your use, 148 | reproduction, and distribution of the Work otherwise complies with 149 | the conditions stated in this License. 150 | 151 | 5. Submission of Contributions. Unless You explicitly state otherwise, 152 | any Contribution intentionally submitted for inclusion in the Work 153 | by You to the Licensor shall be under the terms and conditions of 154 | this License, without any additional terms or conditions. 155 | Notwithstanding the above, nothing herein shall supersede or modify 156 | the terms of any separate license agreement you may have executed 157 | with Licensor regarding such Contributions. 158 | 159 | 6. Trademarks. This License does not grant permission to use the trade 160 | names, trademarks, service marks, or product names of the Licensor, 161 | except as required for reasonable and customary use in describing the 162 | origin of the Work and reproducing the content of the NOTICE file. 163 | 164 | 7. Disclaimer of Warranty. Unless required by applicable law or 165 | agreed to in writing, Licensor provides the Work (and each 166 | Contributor provides its Contributions) on an "AS IS" BASIS, 167 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 168 | implied, including, without limitation, any warranties or conditions 169 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 170 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 171 | appropriateness of using or redistributing the Work and assume any 172 | risks associated with Your exercise of permissions under this License. 173 | 174 | 8. Limitation of Liability. In no event and under no legal theory, 175 | whether in tort (including negligence), contract, or otherwise, 176 | unless required by applicable law (such as deliberate and grossly 177 | negligent acts) or agreed to in writing, shall any Contributor be 178 | liable to You for damages, including any direct, indirect, special, 179 | incidental, or consequential damages of any character arising as a 180 | result of this License or out of the use or inability to use the 181 | Work (including but not limited to damages for loss of goodwill, 182 | work stoppage, computer failure or malfunction, or any and all 183 | other commercial damages or losses), even if such Contributor 184 | has been advised of the possibility of such damages. 185 | 186 | 9. Accepting Warranty or Additional Liability. While redistributing 187 | the Work or Derivative Works thereof, You may choose to offer, 188 | and charge a fee for, acceptance of support, warranty, indemnity, 189 | or other liability obligations and/or rights consistent with this 190 | License. However, in accepting such obligations, You may act only 191 | on Your own behalf and on Your sole responsibility, not on behalf 192 | of any other Contributor, and only if You agree to indemnify, 193 | defend, and hold each Contributor harmless for any liability 194 | incurred by, or claims asserted against, such Contributor by reason 195 | of your accepting any such warranty or additional liability. 196 | 197 | END OF TERMS AND CONDITIONS 198 | 199 | APPENDIX: How to apply the Apache License to your work. 200 | 201 | To apply the Apache License to your work, attach the following 202 | boilerplate notice, with the fields enclosed by brackets "[]" 203 | replaced with your own identifying information. (Don't include 204 | the brackets!) The text should be enclosed in the appropriate 205 | comment syntax for the file format. We also recommend that a 206 | file or class name and description of purpose be included on the 207 | same "printed page" as the copyright notice for easier 208 | identification within third-party archives. 209 | 210 | Copyright [yyyy] [name of copyright owner] 211 | 212 | Licensed under the Apache License, Version 2.0 (the "License"); 213 | you may not use this file except in compliance with the License. 214 | You may obtain a copy of the License at 215 | 216 | http://www.apache.org/licenses/LICENSE-2.0 217 | 218 | Unless required by applicable law or agreed to in writing, software 219 | distributed under the License is distributed on an "AS IS" BASIS, 220 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 221 | See the License for the specific language governing permissions and 222 | limitations under the License. 223 | -------------------------------------------------------------------------------- /code/pytorch_pretrained_bert/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import unicodedata 23 | import os 24 | import logging 25 | 26 | from .file_utils import cached_path 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | PRETRAINED_VOCAB_ARCHIVE_MAP = { 31 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 32 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 33 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", 34 | 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", 35 | 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", 36 | 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", 37 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", 38 | } 39 | VOCAB_NAME = 'vocab.txt' 40 | 41 | 42 | def load_vocab(vocab_file): 43 | """Loads a vocabulary file into a dictionary.""" 44 | vocab = collections.OrderedDict() 45 | index = 0 46 | with open(vocab_file, "r", encoding="utf-8") as reader: 47 | while True: 48 | token = reader.readline() 49 | if not token: 50 | break 51 | token = token.strip() 52 | vocab[token] = index 53 | index += 1 54 | return vocab 55 | 56 | 57 | def whitespace_tokenize(text): 58 | """Runs basic whitespace cleaning and splitting on a peice of text.""" 59 | text = text.strip() 60 | if not text: 61 | return [] 62 | tokens = text.split() 63 | return tokens 64 | 65 | 66 | class BertTokenizer(object): 67 | """Runs end-to-end tokenization: punctuation splitting + wordpiece""" 68 | def __init__(self, vocab_file, do_lower_case=True): 69 | if not os.path.isfile(vocab_file): 70 | raise ValueError( 71 | "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " 72 | "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) 73 | self.vocab = load_vocab(vocab_file) 74 | self.ids_to_tokens = collections.OrderedDict( 75 | [(ids, tok) for tok, ids in self.vocab.items()]) 76 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 77 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 78 | 79 | def tokenize(self, text): 80 | split_tokens = [] 81 | for token in self.basic_tokenizer.tokenize(text): 82 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 83 | split_tokens.append(sub_token) 84 | return split_tokens 85 | 86 | def convert_tokens_to_ids(self, tokens): 87 | """Converts a sequence of tokens into ids using the vocab.""" 88 | ids = [] 89 | for token in tokens: 90 | ids.append(self.vocab[token]) 91 | return ids 92 | 93 | def convert_ids_to_tokens(self, ids): 94 | """Converts a sequence of ids in wordpiece tokens using the vocab.""" 95 | tokens = [] 96 | for i in ids: 97 | tokens.append(self.ids_to_tokens[i]) 98 | return tokens 99 | 100 | @classmethod 101 | def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs): 102 | """ 103 | Instantiate a PreTrainedBertModel from a pre-trained model file. 104 | Download and cache the pre-trained model file if needed. 105 | """ 106 | if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP: 107 | vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name] 108 | else: 109 | vocab_file = pretrained_model_name 110 | if os.path.isdir(vocab_file): 111 | vocab_file = os.path.join(vocab_file, VOCAB_NAME) 112 | # redirect to the cache, if necessary 113 | try: 114 | resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) 115 | except FileNotFoundError: 116 | logger.error( 117 | "Model name '{}' was not found in model name list ({}). " 118 | "We assumed '{}' was a path or url but couldn't find any file " 119 | "associated to this path or url.".format( 120 | pretrained_model_name, 121 | ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), 122 | vocab_file)) 123 | return None 124 | if resolved_vocab_file == vocab_file: 125 | logger.info("loading vocabulary file {}".format(vocab_file)) 126 | else: 127 | logger.info("loading vocabulary file {} from cache at {}".format( 128 | vocab_file, resolved_vocab_file)) 129 | # Instantiate tokenizer. 130 | tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) 131 | return tokenizer 132 | 133 | 134 | class BasicTokenizer(object): 135 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 136 | 137 | def __init__(self, do_lower_case=True): 138 | """Constructs a BasicTokenizer. 139 | 140 | Args: 141 | do_lower_case: Whether to lower case the input. 142 | """ 143 | self.do_lower_case = do_lower_case 144 | 145 | def tokenize(self, text): 146 | """Tokenizes a piece of text.""" 147 | text = self._clean_text(text) 148 | # This was added on November 1st, 2018 for the multilingual and Chinese 149 | # models. This is also applied to the English models now, but it doesn't 150 | # matter since the English models were not trained on any Chinese data 151 | # and generally don't have any Chinese data in them (there are Chinese 152 | # characters in the vocabulary because Wikipedia does have some Chinese 153 | # words in the English Wikipedia.). 
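# For example (illustrative): "谢谢thanks" becomes " 谢  谢 thanks" after
# _tokenize_chinese_chars below, so each Chinese character comes out of the
# whitespace split as its own token.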
154 | text = self._tokenize_chinese_chars(text) 155 | orig_tokens = whitespace_tokenize(text) 156 | split_tokens = [] 157 | for token in orig_tokens: 158 | if self.do_lower_case: 159 | token = token.lower() 160 | token = self._run_strip_accents(token) 161 | split_tokens.extend(self._run_split_on_punc(token)) 162 | 163 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 164 | return output_tokens 165 | 166 | def _run_strip_accents(self, text): 167 | """Strips accents from a piece of text.""" 168 | text = unicodedata.normalize("NFD", text) 169 | output = [] 170 | for char in text: 171 | cat = unicodedata.category(char) 172 | if cat == "Mn": 173 | continue 174 | output.append(char) 175 | return "".join(output) 176 | 177 | def _run_split_on_punc(self, text): 178 | """Splits punctuation on a piece of text.""" 179 | chars = list(text) 180 | i = 0 181 | start_new_word = True 182 | output = [] 183 | while i < len(chars): 184 | char = chars[i] 185 | if _is_punctuation(char): 186 | output.append([char]) 187 | start_new_word = True 188 | else: 189 | if start_new_word: 190 | output.append([]) 191 | start_new_word = False 192 | output[-1].append(char) 193 | i += 1 194 | 195 | return ["".join(x) for x in output] 196 | 197 | def _tokenize_chinese_chars(self, text): 198 | """Adds whitespace around any CJK character.""" 199 | output = [] 200 | for char in text: 201 | cp = ord(char) 202 | if self._is_chinese_char(cp): 203 | output.append(" ") 204 | output.append(char) 205 | output.append(" ") 206 | else: 207 | output.append(char) 208 | return "".join(output) 209 | 210 | def _is_chinese_char(self, cp): 211 | """Checks whether CP is the codepoint of a CJK character.""" 212 | # This defines a "chinese character" as anything in the CJK Unicode block: 213 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 214 | # 215 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 216 | # despite its name. The modern Korean Hangul alphabet is a different block, 217 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 218 | # space-separated words, so they are not treated specially and handled 219 | # like the all of the other languages. 220 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 221 | (cp >= 0x3400 and cp <= 0x4DBF) or # 222 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 223 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 224 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 225 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 226 | (cp >= 0xF900 and cp <= 0xFAFF) or # 227 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 228 | return True 229 | 230 | return False 231 | 232 | def _clean_text(self, text): 233 | """Performs invalid character removal and whitespace cleanup on text.""" 234 | output = [] 235 | for char in text: 236 | cp = ord(char) 237 | if cp == 0 or cp == 0xfffd or _is_control(char): 238 | continue 239 | if _is_whitespace(char): 240 | output.append(" ") 241 | else: 242 | output.append(char) 243 | return "".join(output) 244 | 245 | 246 | class WordpieceTokenizer(object): 247 | """Runs WordPiece tokenization.""" 248 | 249 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): 250 | self.vocab = vocab 251 | self.unk_token = unk_token 252 | self.max_input_chars_per_word = max_input_chars_per_word 253 | 254 | def tokenize(self, text): 255 | """Tokenizes a piece of text into its word pieces. 256 | 257 | This uses a greedy longest-match-first algorithm to perform tokenization 258 | using the given vocabulary. 
259 | 260 | For example: 261 | input = "unaffable" 262 | output = ["un", "##aff", "##able"] 263 | 264 | Args: 265 | text: A single token or whitespace separated tokens. This should have 266 | already been passed through `BasicTokenizer. 267 | 268 | Returns: 269 | A list of wordpiece tokens. 270 | """ 271 | 272 | output_tokens = [] 273 | for token in whitespace_tokenize(text): 274 | chars = list(token) 275 | if len(chars) > self.max_input_chars_per_word: 276 | output_tokens.append(self.unk_token) 277 | continue 278 | 279 | is_bad = False 280 | start = 0 281 | sub_tokens = [] 282 | while start < len(chars): 283 | end = len(chars) 284 | cur_substr = None 285 | while start < end: 286 | substr = "".join(chars[start:end]) 287 | if start > 0: 288 | substr = "##" + substr 289 | if substr in self.vocab: 290 | cur_substr = substr 291 | break 292 | end -= 1 293 | if cur_substr is None: 294 | is_bad = True 295 | break 296 | sub_tokens.append(cur_substr) 297 | start = end 298 | 299 | if is_bad: 300 | output_tokens.append(self.unk_token) 301 | else: 302 | output_tokens.extend(sub_tokens) 303 | return output_tokens 304 | 305 | 306 | def _is_whitespace(char): 307 | """Checks whether `chars` is a whitespace character.""" 308 | # \t, \n, and \r are technically contorl characters but we treat them 309 | # as whitespace since they are generally considered as such. 310 | if char == " " or char == "\t" or char == "\n" or char == "\r": 311 | return True 312 | cat = unicodedata.category(char) 313 | if cat == "Zs": 314 | return True 315 | return False 316 | 317 | 318 | def _is_control(char): 319 | """Checks whether `chars` is a control character.""" 320 | # These are technically control characters but we count them as whitespace 321 | # characters. 322 | if char == "\t" or char == "\n" or char == "\r": 323 | return False 324 | cat = unicodedata.category(char) 325 | if cat.startswith("C"): 326 | return True 327 | return False 328 | 329 | 330 | def _is_punctuation(char): 331 | """Checks whether `chars` is a punctuation character.""" 332 | cp = ord(char) 333 | # We treat all non-letter/number ASCII as punctuation. 334 | # Characters such as "^", "$", and "`" are not in the Unicode 335 | # Punctuation class but we treat them as punctuation anyways, for 336 | # consistency. 
337 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 338 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 339 | return True 340 | cat = unicodedata.category(char) 341 | if cat.startswith("P"): 342 | return True 343 | return False 344 | -------------------------------------------------------------------------------- /code/run_arc.py: -------------------------------------------------------------------------------- 1 | """BERT finetuning runner.""" 2 | import logging 3 | import os 4 | import argparse 5 | import random 6 | import collections 7 | from tqdm import tqdm, trange 8 | 9 | import numpy as np 10 | import pandas as pd 11 | import torch 12 | from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler 13 | from torch.utils.data.distributed import DistributedSampler 14 | 15 | from pytorch_pretrained_bert.tokenization import BertTokenizer 16 | from pytorch_pretrained_bert.modeling import BertForMultipleChoice 17 | from pytorch_pretrained_bert.optimization import BertAdam 18 | 19 | 20 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 21 | datefmt = '%m/%d/%Y %H:%M:%S', 22 | level = logging.INFO) 23 | logger = logging.getLogger(__name__) 24 | DEBUG = False 25 | 26 | 27 | class SwagExample(object): 28 | """A single training/test example for the SWAG dataset.""" 29 | def __init__(self, 30 | swag_id, 31 | context_sentence, 32 | start_ending, 33 | endings, 34 | label=None): 35 | self.swag_id = swag_id 36 | self.context_sentence = context_sentence 37 | self.start_ending = start_ending 38 | self.endings = endings 39 | self.label = label 40 | 41 | def __str__(self): 42 | return self.__repr__() 43 | 44 | def __repr__(self): 45 | l = [ 46 | f"swag_id: {self.swag_id}", 47 | f"context_sentence: {self.context_sentence}", 48 | f"start_ending: {self.start_ending}", 49 | f"ending_0: {self.endings[0]}", 50 | f"ending_1: {self.endings[1]}", 51 | f"ending_2: {self.endings[2]}", 52 | f"ending_3: {self.endings[3]}", 53 | ] 54 | 55 | if self.label is not None: 56 | l.append(f"label: {self.label}") 57 | 58 | return ", ".join(l) 59 | 60 | 61 | class InputFeatures(object): 62 | def __init__(self, 63 | example_id, 64 | choices_features, 65 | label 66 | 67 | ): 68 | self.example_id = example_id 69 | self.choices_features = [ 70 | { 71 | 'input_ids': input_ids, 72 | 'input_mask': input_mask, 73 | 'segment_ids': segment_ids 74 | } 75 | for _, input_ids, input_mask, segment_ids in choices_features 76 | ] 77 | self.label = label 78 | 79 | 80 | def read_swag_examples(input_file): 81 | import json 82 | import string 83 | 84 | with open(input_file, 'r') as f: 85 | data = json.load(f) 86 | 87 | examples = [] 88 | answer_mapping = {string.ascii_uppercase[i]: i for i in range(4)} 89 | 90 | for (i, passage_id) in enumerate(data.keys()): 91 | for question_id in range(len(data[passage_id]['questions'])): 92 | guid = '%s-%s' % (passage_id, str(question_id)) 93 | article = data[passage_id]['article'] 94 | question = data[passage_id]['questions'][question_id] 95 | choices = data[passage_id]['options'][question_id] 96 | label = answer_mapping[data[passage_id]['answers'][question_id].upper()] 97 | examples.append( 98 | SwagExample(swag_id=guid, start_ending=question, endings=choices, context_sentence=article, label=label)) 99 | return examples 100 | 101 | 102 | def convert_examples_to_features(examples, tokenizer, max_seq_length, is_training, rev=False): 103 | """Loads a data file into a list of `InputBatch`s.""" 104 | 105 | # Swag is a multiple 
choice task. To perform this task using Bert, 106 | # we will use the formatting proposed in "Improving Language 107 | # Understanding by Generative Pre-Training" and suggested by 108 | # @jacobdevlin-google in this issue 109 | # https://github.com/google-research/bert/issues/38. 110 | # 111 | # Each choice will correspond to a sample on which we run the 112 | # inference. For a given Swag example, we will create the 4 113 | # following inputs: 114 | # - [CLS] context [SEP] choice_1 [SEP] 115 | # - [CLS] context [SEP] choice_2 [SEP] 116 | # - [CLS] context [SEP] choice_3 [SEP] 117 | # - [CLS] context [SEP] choice_4 [SEP] 118 | # The model will output a single value for each input. To get the 119 | # final decision of the model, we will run a softmax over these 4 120 | # outputs. 121 | features = [] 122 | for example_index, example in enumerate(examples): 123 | context_tokens = tokenizer.tokenize(example.context_sentence) 124 | start_ending_tokens = tokenizer.tokenize(example.start_ending) 125 | 126 | choices_features = [] 127 | for ending_index, ending in enumerate(example.endings): 128 | # We create a copy of the context tokens in order to be 129 | # able to shrink it according to ending_tokens 130 | context_tokens_choice = context_tokens[:] 131 | ending_tokens = start_ending_tokens + tokenizer.tokenize(ending) 132 | # Modifies `context_tokens_choice` and `ending_tokens` in 133 | # place so that the total length is less than the 134 | # specified length. Account for [CLS], [SEP], [SEP] with 135 | # "- 3" 136 | _truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3) 137 | 138 | if rev: 139 | tokens = ["[CLS]"] + ending_tokens + ["[SEP]"] + context_tokens_choice + ["[SEP]"] 140 | else: 141 | tokens = ["[CLS]"] + context_tokens_choice + ["[SEP]"] + ending_tokens + ["[SEP]"] 142 | segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1) 143 | 144 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 145 | input_mask = [1] * len(input_ids) 146 | 147 | # Zero-pad up to the sequence length. 148 | padding = [0] * (max_seq_length - len(input_ids)) 149 | input_ids += padding 150 | input_mask += padding 151 | segment_ids += padding 152 | 153 | assert len(input_ids) == max_seq_length 154 | assert len(input_mask) == max_seq_length 155 | assert len(segment_ids) == max_seq_length 156 | 157 | choices_features.append((tokens, input_ids, input_mask, segment_ids)) 158 | 159 | label = example.label 160 | if example_index < 0: 161 | logger.info("*** Example ***") 162 | logger.info(f"swag_id: {example.swag_id}") 163 | for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features): 164 | logger.info(f"choice: {choice_idx}") 165 | logger.info(f"tokens: {' '.join(tokens)}") 166 | logger.info(f"input_ids: {' '.join(map(str, input_ids))}") 167 | logger.info(f"input_mask: {' '.join(map(str, input_mask))}") 168 | logger.info(f"segment_ids: {' '.join(map(str, segment_ids))}") 169 | if is_training: 170 | logger.info(f"label: {label}") 171 | 172 | features.append( 173 | InputFeatures( 174 | example_id=example.swag_id, 175 | choices_features=choices_features, 176 | label=label 177 | ) 178 | ) 179 | 180 | return features 181 | 182 | 183 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 184 | """Truncates a sequence pair in place to the maximum length.""" 185 | 186 | # This is a simple heuristic which will always truncate the longer sequence 187 | # one token at a time. 
This makes more sense than truncating an equal percent 188 | # of tokens from each, since if one sequence is very short then each token 189 | # that's truncated likely contains more information than a longer sequence. 190 | while True: 191 | total_length = len(tokens_a) + len(tokens_b) 192 | if total_length <= max_length: 193 | break 194 | if len(tokens_a) > len(tokens_b): 195 | tokens_a.pop() 196 | else: 197 | tokens_b.pop() 198 | 199 | 200 | def accuracy(out, labels): 201 | outputs = np.argmax(out, axis=1) 202 | return np.sum(outputs == labels) 203 | 204 | 205 | def select_field(features, field): 206 | return [ 207 | [ 208 | choice[field] 209 | for choice in feature.choices_features 210 | ] 211 | for feature in features 212 | ] 213 | 214 | 215 | def warmup_linear(x, warmup=0.002): 216 | if x < warmup: 217 | return x/warmup 218 | return 1.0 - x 219 | 220 | 221 | parser = argparse.ArgumentParser() 222 | 223 | ## Required parameters 224 | parser.add_argument("--bert_dir", 225 | default=None, 226 | type=str, 227 | required=True, 228 | help="The BERT directory, relative to MODEL_FOLDER") 229 | 230 | ## Environment parameters 231 | parser.add_argument("--ON_AZURE", 232 | default=False, 233 | help="if training the mode on azure") 234 | parser.add_argument("--ON_PHILLY", 235 | default=False, 236 | type=bool, 237 | help="if training the mode on philly") 238 | parser.add_argument("--DATA_FOLDER", 239 | default=None, 240 | type=str, 241 | help="The global data folder, for AZURE") 242 | parser.add_argument("--task_name", 243 | default=None, 244 | type=str, 245 | help="The name of the task to train.") 246 | parser.add_argument("--output_dir", 247 | default='./outputs', 248 | type=str, 249 | help="The output directory where the model checkpoints will be written.") 250 | 251 | ## Other parameters 252 | parser.add_argument("--rev", 253 | default=False, 254 | type=bool, 255 | help="Reverse the order of LM or not, default is False") 256 | parser.add_argument("--init_checkpoint", 257 | default=None, 258 | type=str, 259 | help="Initial checkpoint (usually from a pre-trained BERT model), relative to MODEL_FOLDER") 260 | parser.add_argument("--max_seq_length", 261 | default=128, 262 | type=int, 263 | help="The maximum total input sequence length after WordPiece tokenization. \n" 264 | "Sequences longer than this will be truncated, and sequences shorter \n" 265 | "than this will be padded.") 266 | parser.add_argument("--do_train", 267 | default=False, 268 | type=bool, 269 | help="Whether to run training.") 270 | parser.add_argument("--do_eval", 271 | default=False, 272 | type=bool, 273 | help="Whether to run eval on the dev set.") 274 | parser.add_argument("--do_lower_case", 275 | default=True, 276 | action='store_true', 277 | help="Set this flag if you are using an uncased model.") 278 | parser.add_argument("--train_batch_size", 279 | default=32, 280 | type=int, 281 | help="Total batch size for training.") 282 | parser.add_argument("--eval_batch_size", 283 | default=8, 284 | type=int, 285 | help="Total batch size for eval.") 286 | parser.add_argument("--learning_rate", 287 | default=5e-5, 288 | type=float, 289 | help="The initial learning rate for Adam.") 290 | parser.add_argument("--num_train_epochs", 291 | default=3.0, 292 | type=float, 293 | help="Total number of training epochs to perform.") 294 | parser.add_argument("--warmup_proportion", 295 | default=0.1, 296 | type=float, 297 | help="Proportion of training to perform linear learning rate warmup for. 
" 298 | "E.g., 0.1 = 10%% of training.") 299 | parser.add_argument("--no_cuda", 300 | default=False, 301 | action='store_true', 302 | help="Whether not to use CUDA when available") 303 | parser.add_argument("--local_rank", 304 | type=int, 305 | default=-1, 306 | help="local_rank for distributed training on gpus") 307 | parser.add_argument('--seed', 308 | type=int, 309 | default=42, 310 | help="random seed for initialization") 311 | parser.add_argument('--gradient_accumulation_steps', 312 | type=int, 313 | default=1, 314 | help="Number of updates steps to accumulate before performing a backward/update pass.") 315 | parser.add_argument('--fp16', 316 | type = bool, 317 | default=False, 318 | #action='store_true', 319 | help="Whether to use 16-bit float precision instead of 32-bit") 320 | parser.add_argument('--loss_scale', 321 | type=float, default=0, 322 | help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" 323 | "0 (default value): dynamic loss scaling.\n" 324 | "Positive power of 2: static loss scaling value.\n") 325 | parser.add_argument('--test_feature_file', 326 | type=str, 327 | default=None, 328 | help="test feature file for evaluation, only used when do_eval is True") 329 | 330 | 331 | args = parser.parse_args() 332 | 333 | if args.ON_PHILLY: 334 | logger.info('Training on PHILLY') 335 | #BLANK because this is Microsoft Only 336 | elif args.ON_AZURE: 337 | logger.info('Training on AZURE') 338 | #BLANK because this is Microsoft Only 339 | else: 340 | logger.info('Training on LOCAL') 341 | 342 | assert args.DATA_FOLDER is not None, "DATA FOLDER cannot be None" 343 | 344 | task_name = args.task_name.lower() 345 | args.data_dir = '/'.join([args.DATA_FOLDER, 'data', task_name]) # os.path.join(args.DATA_FOLDER, 'data', task_name) 346 | args.MODEL_FOLDER = '/'.join([args.DATA_FOLDER, 'model']) # os.path.join(args.DATA_FOLDER, 'model') 347 | args.bert_model = '/'.join([args.MODEL_FOLDER, args.bert_dir]) # os.path.join(args.MODEL_FOLDER, args.bert_dir) 348 | if args.init_checkpoint is not None: 349 | # args.init_checkpoint = os.path.join(args.MODEL_FOLDER, args.init_checkpoint) 350 | args.init_checkpoint = '/'.join([args.MODEL_FOLDER, args.init_checkpoint]) 351 | 352 | logger.info('Input Argument Information') 353 | args_dict = vars(args) 354 | for a in args_dict: 355 | logger.info('%-28s %s' % (a, args_dict[a])) 356 | 357 | 358 | ########################################################################## 359 | # Get Machine Configuration and check input arguments 360 | ########################################################################## 361 | if args.local_rank == -1 or args.no_cuda: 362 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 363 | n_gpu = torch.cuda.device_count() 364 | else: 365 | torch.cuda.set_device(args.local_rank) 366 | device = torch.device("cuda", args.local_rank) 367 | n_gpu = 1 368 | # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 369 | torch.distributed.init_process_group(backend='nccl') 370 | logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( 371 | device, n_gpu, bool(args.local_rank != -1), args.fp16)) 372 | 373 | if args.gradient_accumulation_steps < 1: 374 | raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( 375 | args.gradient_accumulation_steps)) 376 | 377 | args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) 378 | 
logger.info('training batch size on each GPU = %d' % args.train_batch_size) 379 | 380 | args.seed = random.randint(0, 100000000) 381 | logger.info('new seed = %d' % args.seed) 382 | 383 | random.seed(args.seed) 384 | np.random.seed(args.seed) 385 | torch.manual_seed(args.seed) 386 | if n_gpu > 0: 387 | torch.cuda.manual_seed_all(args.seed) 388 | 389 | if not args.do_train and not args.do_eval: 390 | raise ValueError("At least one of `do_train` or `do_eval` must be True.") 391 | 392 | # if os.path.exists(args.output_dir) and os.listdir(args.output_dir): 393 | # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) 394 | os.makedirs(args.output_dir, exist_ok=True) 395 | 396 | 397 | ########################################################################## 398 | # Prepare for Model 399 | ########################################################################## 400 | tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) 401 | logger.info('loading pretrained bert model from %s' % args.init_checkpoint) 402 | model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4) 403 | if args.init_checkpoint is not None: 404 | logger.info('loading finetuned model from %s' % args.init_checkpoint) 405 | model_state_dict = torch.load(args.init_checkpoint) 406 | model_state_dict_new = collections.OrderedDict() 407 | for t in model_state_dict: 408 | new_key = t 409 | if t.startswith('module.'): 410 | new_key = t.replace('module.', '') 411 | model_state_dict_new[new_key] = model_state_dict[t] 412 | model.load_state_dict(model_state_dict_new) 413 | 414 | if args.fp16: 415 | model.half() 416 | model.to(device) 417 | if args.local_rank != -1: 418 | try: 419 | from apex.parallel import DistributedDataParallel as DDP 420 | except ImportError: 421 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") 422 | 423 | model = DDP(model) 424 | elif n_gpu > 1: 425 | model = torch.nn.DataParallel(model) 426 | 427 | 428 | ########################################################################## 429 | # Process Data 430 | ########################################################################## 431 | if args.do_train: 432 | #train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.' + task_name + '.json')) 433 | train_examples = read_swag_examples('/'.join([args.data_dir, 'train.' 
+ task_name + '.json'])) 434 | num_train_steps = int(len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) 435 | 436 | # Prepare optimizer 437 | param_optimizer = list(model.named_parameters()) 438 | 439 | # hack to remove the pooler, which is not used 440 | # and would otherwise produce None grads that break apex 441 | param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] 442 | 443 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 444 | optimizer_grouped_parameters = [ 445 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, 446 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 447 | ] 448 | t_total = num_train_steps 449 | if args.local_rank != -1: 450 | t_total = t_total // torch.distributed.get_world_size() 451 | if args.fp16: 452 | try: 453 | from apex.optimizers import FP16_Optimizer 454 | from apex.optimizers import FusedAdam 455 | except ImportError: 456 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") 457 | 458 | optimizer = FusedAdam(optimizer_grouped_parameters, 459 | lr=args.learning_rate, 460 | bias_correction=False, 461 | max_grad_norm=1.0) 462 | if args.loss_scale == 0: 463 | optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) 464 | else: 465 | optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) 466 | else: 467 | optimizer = BertAdam(optimizer_grouped_parameters, 468 | lr=args.learning_rate, 469 | warmup=args.warmup_proportion, 470 | t_total=t_total) 471 | 472 | global_step = 0 473 | 474 | # Prepare Tensor Data and train Dataloader 475 | train_features = convert_examples_to_features( 476 | train_examples, tokenizer, args.max_seq_length, True, args.rev) 477 | logger.info("***** Running training *****") 478 | logger.info(" Num examples = %d", len(train_examples)) 479 | logger.info(" Batch size = %d", args.train_batch_size) 480 | logger.info(" Num steps = %d", num_train_steps) 481 | all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) 482 | all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) 483 | all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) 484 | all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) 485 | train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) 486 | if args.local_rank == -1: 487 | train_sampler = RandomSampler(train_data) 488 | else: 489 | train_sampler = DistributedSampler(train_data) 490 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) 491 | 492 | # Prepare Tensor Data and dev Dataloader 493 | # eval_examples = read_swag_examples(os.path.join(args.data_dir, 'dev.' + task_name + '.json')) 494 | eval_examples = read_swag_examples('/'.join([args.data_dir, 'dev.'
+ task_name + '.json'])) 495 | eval_features = convert_examples_to_features( 496 | eval_examples, tokenizer, args.max_seq_length, True, args.rev) 497 | logger.info("***** Running evaluation *****") 498 | logger.info(" Num examples = %d", len(eval_examples)) 499 | logger.info(" Batch size = %d", args.eval_batch_size) 500 | all_input_ids_eval = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) 501 | all_input_mask_eval = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) 502 | all_segment_ids_eval = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) 503 | all_label_eval = torch.tensor([f.label for f in eval_features], dtype=torch.long) 504 | eval_data = TensorDataset(all_input_ids_eval, all_input_mask_eval, all_segment_ids_eval, all_label_eval) 505 | # Run prediction for full data 506 | eval_sampler = SequentialSampler(eval_data) 507 | eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) 508 | 509 | # Training 510 | model.train() 511 | for epoch in trange(int(args.num_train_epochs), desc="Epoch"): 512 | tr_loss = 0 513 | nb_tr_examples, nb_tr_steps = 0, 0 514 | # for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", miniters=int(len(train_dataloader)/100))): 515 | for step, batch in enumerate(train_dataloader): 516 | batch = tuple(t.to(device) for t in batch) 517 | input_ids, input_mask, segment_ids, label_ids = batch 518 | loss = model(input_ids, segment_ids, input_mask, label_ids) 519 | if n_gpu > 1: 520 | loss = loss.mean() # mean() to average on multi-gpu. 521 | if args.fp16 and args.loss_scale != 1.0: 522 | # rescale loss for fp16 training 523 | # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html 524 | loss = loss * args.loss_scale 525 | if args.gradient_accumulation_steps > 1: 526 | loss = loss / args.gradient_accumulation_steps 527 | tr_loss += loss.item() 528 | nb_tr_examples += input_ids.size(0) 529 | nb_tr_steps += 1 530 | 531 | if args.fp16: 532 | optimizer.backward(loss) 533 | else: 534 | loss.backward() 535 | if (step + 1) % args.gradient_accumulation_steps == 0: 536 | # modify learning rate with special warm up BERT uses 537 | lr_this_step = args.learning_rate * warmup_linear(global_step/t_total, args.warmup_proportion) 538 | for param_group in optimizer.param_groups: 539 | param_group['lr'] = lr_this_step 540 | optimizer.step() 541 | optimizer.zero_grad() 542 | global_step += 1 543 | 544 | # Save a trained model for each epoch 545 | model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self 546 | #output_model_file = os.path.join(args.output_dir, "%s-%d-%.6f.bin" % (task_name, epoch, args.learning_rate)) 547 | output_model_file = '/'.join([args.output_dir, "%s-%d-%.6f-%d.bin" % (task_name, epoch, args.learning_rate, 548 | args.train_batch_size)]) 549 | torch.save(model_to_save.state_dict(), output_model_file) 550 | 551 | # Evaluation on eval dataset 552 | model.eval() 553 | eval_loss, eval_accuracy = 0, 0 554 | nb_eval_steps, nb_eval_examples = 0, 0 555 | for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: 556 | input_ids = input_ids.to(device) 557 | input_mask = input_mask.to(device) 558 | segment_ids = segment_ids.to(device) 559 | label_ids = label_ids.to(device) 560 | 561 | with torch.no_grad(): 562 | tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) 563 | logits = model(input_ids, segment_ids, input_mask) 564 | 565 | logits = 
logits.detach().cpu().numpy() 566 | label_ids = label_ids.to('cpu').numpy() 567 | tmp_eval_accuracy = accuracy(logits, label_ids) 568 | 569 | eval_loss += tmp_eval_loss.mean().item() 570 | eval_accuracy += tmp_eval_accuracy 571 | 572 | nb_eval_examples += input_ids.size(0) 573 | nb_eval_steps += 1 574 | 575 | eval_loss = eval_loss / nb_eval_steps 576 | eval_accuracy = eval_accuracy / nb_eval_examples 577 | 578 | result = {'eval_loss': eval_loss, 579 | 'eval_accuracy': eval_accuracy, 580 | 'global_step': global_step, 581 | 'loss': tr_loss/nb_tr_steps} 582 | 583 | logger.info("***** Eval results *****") 584 | for key in sorted(result.keys()): 585 | logger.info(" %s = %s", key, str(result[key])) 586 | 587 | if args.do_eval: 588 | test_files = args.test_feature_file 589 | if test_files is None: 590 | #test_files = os.path.join(args.data_dir, 'test.' + task_name + '.json') 591 | test_files = '/'.join([args.data_dir, 'test.' + task_name + '.json']) 592 | for f in test_files.split(','): 593 | logger.info('for test file %s' % f) 594 | f = os.path.join(args.DATA_FOLDER, 'data', f) 595 | test_examples = read_swag_examples(f) 596 | 597 | test_features = convert_examples_to_features(test_examples, tokenizer, args.max_seq_length, True, args.rev) 598 | logger.info("***** Running Testing *****") 599 | logger.info(" Num examples = %d", len(test_examples)) 600 | logger.info(" Batch size = %d", args.eval_batch_size) 601 | all_input_ids = torch.tensor(select_field(test_features, 'input_ids'), dtype=torch.long) 602 | all_input_mask = torch.tensor(select_field(test_features, 'input_mask'), dtype=torch.long) 603 | all_segment_ids = torch.tensor(select_field(test_features, 'segment_ids'), dtype=torch.long) 604 | all_label = torch.tensor([f.label for f in test_features], dtype=torch.long) 605 | test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) 606 | test_dataloader = DataLoader(test_data, batch_size=args.eval_batch_size) 607 | 608 | model.eval() 609 | eval_loss, eval_accuracy = 0, 0 610 | nb_eval_steps, nb_eval_examples = 0, 0 611 | 612 | pred_logits = [] 613 | test_labels = [] 614 | for input_ids, input_mask, segment_ids, label_ids in test_dataloader: 615 | input_ids = input_ids.to(device) 616 | input_mask = input_mask.to(device) 617 | segment_ids = segment_ids.to(device) 618 | label_ids = label_ids.to(device) 619 | 620 | with torch.no_grad(): 621 | tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) 622 | logits = model(input_ids, segment_ids, input_mask) 623 | 624 | logits = logits.detach().cpu().numpy() 625 | label_ids = label_ids.to('cpu').numpy() 626 | tmp_eval_accuracy = accuracy(logits, label_ids) 627 | 628 | pred_logits.append(logits) 629 | test_labels.append(label_ids) 630 | 631 | eval_loss += tmp_eval_loss.mean().item() 632 | eval_accuracy += tmp_eval_accuracy 633 | 634 | nb_eval_examples += input_ids.size(0) 635 | nb_eval_steps += 1 636 | 637 | eval_loss = eval_loss / nb_eval_steps 638 | eval_accuracy = eval_accuracy / nb_eval_examples 639 | 640 | result = {'eval_loss': eval_loss, 641 | 'eval_accuracy': eval_accuracy} 642 | 643 | logger.info("***** Eval results *****") 644 | for key in sorted(result.keys()): 645 | logger.info(" %s = %s", key, str(result[key])) 646 | 647 | test_label_flat = np.concatenate(test_labels) 648 | pred_logits_flat = np.concatenate(pred_logits) 649 | 650 | def np_softmax(x): 651 | """Compute softmax values for each sets of scores in x.""" 652 | e_x = np.exp(x - np.max(x)) 653 | return e_x / e_x.sum(axis=0) 654 | 655 | 
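# Softmax the four choice logits for each question (np_softmax normalizes along
# axis 0, hence the transposes below), take the arg-max as the predicted answer,
# and write ids, predictions, gold labels and per-choice probabilities to a CSV.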
test_ids = [test_examples[i].swag_id for i in range(0, len(test_examples))] 656 | answers = ['A', 'B', 'C', 'D'] 657 | probs = np_softmax(pred_logits_flat.T).T 658 | preds = probs.argmax(1) 659 | 660 | test_res = pd.DataFrame() 661 | test_res['id'] = test_ids[:len(preds)] 662 | test_res['answer'] = [answers[i] for i in preds] 663 | test_res['label'] = [answers[i] for i in test_label_flat] 664 | for idx_a, a in enumerate(answers): 665 | test_res['probs' + a] = probs[:, idx_a] 666 | test_res_file = os.path.basename(args.init_checkpoint) + '-' + os.path.basename(f) + '.csv' 667 | #test_res.to_csv(os.path.join(args.output_dir, test_res_file)) 668 | test_res.to_csv('/'.join([args.output_dir, test_res_file])) 669 | -------------------------------------------------------------------------------- /code/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Pretrained Bert 2 | 3 | This repository contains an op-for-op PyTorch reimplementation of [Google's TensorFlow repository for the BERT model](https://github.com/google-research/bert) that was released together with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. 4 | 5 | This implementation is provided with [Google's pre-trained models](https://github.com/google-research/bert), examples, notebooks and a command-line interface that can load any pre-trained TensorFlow checkpoint for BERT. 6 | 7 | ## Content 8 | 9 | | Section | Description | 10 | |-|-| 11 | | [Installation](#installation) | How to install the package | 12 | | [Overview](#overview) | Overview of the package | 13 | | [Usage](#usage) | Quickstart examples | 14 | | [Doc](#doc) | Detailed documentation | 15 | | [Examples](#examples) | Detailed examples on how to fine-tune Bert | 16 | | [Notebooks](#notebooks) | Introduction to the provided Jupyter Notebooks | 17 | | [TPU](#tpu) | Notes on TPU support and pretraining scripts | 18 | | [Command-line interface](#Command-line-interface) | Convert a TensorFlow checkpoint into a PyTorch dump | 19 | 20 | ## Installation 21 | 22 | This repo was tested on Python 3.5+ and PyTorch 0.4.1/1.0.0. 23 | 24 | ### With pip 25 | 26 | PyTorch pretrained bert can be installed with pip as follows: 27 | ```bash 28 | pip install pytorch-pretrained-bert 29 | ``` 30 | 31 | ### From source 32 | 33 | Clone the repository and run: 34 | ```bash 35 | pip install [--editable] . 36 | ``` 37 | 38 | A series of tests is included in the [tests folder](https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/tests) and can be run using `pytest` (install pytest if needed: `pip install pytest`).
39 | 40 | You can run the tests with the command: 41 | ```bash 42 | python -m pytest -sv tests/ 43 | ``` 44 | 45 | ## Overview 46 | 47 | This package comprises the following classes that can be imported in Python and are detailed in the [Doc](#doc) section of this readme: 48 | 49 | - Eight PyTorch models (`torch.nn.Module`) for Bert with pre-trained weights (in the [`modeling.py`](./pytorch_pretrained_bert/modeling.py) file): 50 | - [`BertModel`](./pytorch_pretrained_bert/modeling.py#L537) - raw BERT Transformer model (**fully pre-trained**), 51 | - [`BertForMaskedLM`](./pytorch_pretrained_bert/modeling.py#L691) - BERT Transformer with the pre-trained masked language modeling head on top (**fully pre-trained**), 52 | - [`BertForNextSentencePrediction`](./pytorch_pretrained_bert/modeling.py#L752) - BERT Transformer with the pre-trained next sentence prediction classifier on top (**fully pre-trained**), 53 | - [`BertForPreTraining`](./pytorch_pretrained_bert/modeling.py#L620) - BERT Transformer with masked language modeling head and next sentence prediction classifier on top (**fully pre-trained**), 54 | - [`BertForSequenceClassification`](./pytorch_pretrained_bert/modeling.py#L814) - BERT Transformer with a sequence classification head on top (BERT Transformer is **pre-trained**, the sequence classification head **is only initialized and has to be trained**), 55 | - [`BertForMultipleChoice`](./pytorch_pretrained_bert/modeling.py#L880) - BERT Transformer with a multiple choice head on top (used for task like Swag) (BERT Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**), 56 | - [`BertForTokenClassification`](./pytorch_pretrained_bert/modeling.py#L949) - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**), 57 | - [`BertForQuestionAnswering`](./pytorch_pretrained_bert/modeling.py#L1015) - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**). 58 | 59 | - Three tokenizers (in the [`tokenization.py`](./pytorch_pretrained_bert/tokenization.py) file): 60 | - `BasicTokenizer` - basic tokenization (punctuation splitting, lower casing, etc.), 61 | - `WordpieceTokenizer` - WordPiece tokenization, 62 | - `BertTokenizer` - perform end-to-end tokenization, i.e. basic tokenization followed by WordPiece tokenization. 63 | 64 | - One optimizer (in the [`optimization.py`](./pytorch_pretrained_bert/optimization.py) file): 65 | - `BertAdam` - Bert version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate. 66 | 67 | - A configuration class (in the [`modeling.py`](./pytorch_pretrained_bert/modeling.py) file): 68 | - `BertConfig` - Configuration class to store the configuration of a `BertModel` with utilisities to read and write from JSON configuration files. 
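As a quick illustration of the configuration class, here is a minimal sketch (not taken from this repository's code; the hyper-parameter values shown are simply the usual BERT-base defaults) of building a `BertConfig`, round-tripping it through JSON and instantiating a randomly initialized `BertModel` from it:

```python
from pytorch_pretrained_bert import BertConfig, BertModel

# Build a configuration object (values below are the usual BERT-base defaults)
config = BertConfig(vocab_size_or_config_json_file=30522,
                    hidden_size=768,
                    num_hidden_layers=12,
                    num_attention_heads=12,
                    intermediate_size=3072)

# Serialize it to JSON, read it back, and build an (untrained) model from it
with open("bert_config.json", "w") as f:
    f.write(config.to_json_string())
config = BertConfig.from_json_file("bert_config.json")
model = BertModel(config)  # weights are randomly initialized here
```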
69 | 70 | The repository further comprises: 71 | 72 | - Four examples on how to use Bert (in the [`examples` folder](./examples)): 73 | - [`extract_features.py`](./examples/extract_features.py) - Show how to extract hidden states from an instance of `BertModel`, 74 | - [`run_classifier.py`](./examples/run_classifier.py) - Show how to fine-tune an instance of `BertForSequenceClassification` on GLUE's MRPC task, 75 | - [`run_squad.py`](./examples/run_squad.py) - Show how to fine-tune an instance of `BertForQuestionAnswering` on SQuAD v1.0 task. 76 | - [`run_swag.py`](./examples/run_swag.py) - Show how to fine-tune an instance of `BertForMultipleChoice` on Swag task. 77 | 78 | These examples are detailed in the [Examples](#examples) section of this readme. 79 | 80 | - Three notebooks that were used to check that the TensorFlow and PyTorch models behave identically (in the [`notebooks` folder](./notebooks)): 81 | - [`Comparing-TF-and-PT-models.ipynb`](./notebooks/Comparing-TF-and-PT-models.ipynb) - Compare the hidden states predicted by `BertModel`, 82 | - [`Comparing-TF-and-PT-models-SQuAD.ipynb`](./notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb) - Compare the spans predicted by `BertForQuestionAnswering` instances, 83 | - [`Comparing-TF-and-PT-models-MLM-NSP.ipynb`](./notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb) - Compare the predictions of the `BertForPretraining` instances. 84 | 85 | These notebooks are detailed in the [Notebooks](#notebooks) section of this readme. 86 | 87 | - A command-line interface to convert any TensorFlow checkpoint in a PyTorch dump: 88 | 89 | This CLI is detailed in the [Command-line interface](#Command-line-interface) section of this readme. 90 | 91 | ## Usage 92 | 93 | Here is a quick-start example using `BertTokenizer`, `BertModel` and `BertForMaskedLM` class with Google AI's pre-trained `Bert base uncased` model. See the [doc section](#doc) below for all the details on these classes. 94 | 95 | First let's prepare a tokenized input with `BertTokenizer` 96 | 97 | ```python 98 | import torch 99 | from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM 100 | 101 | # Load pre-trained model tokenizer (vocabulary) 102 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 103 | 104 | # Tokenized input 105 | text = "Who was Jim Henson ? 
Jim Henson was a puppeteer" 106 | tokenized_text = tokenizer.tokenize(text) 107 | 108 | # Mask a token that we will try to predict back with `BertForMaskedLM` 109 | masked_index = 6 110 | tokenized_text[masked_index] = '[MASK]' 111 | assert tokenized_text == ['who', 'was', 'jim', 'henson', '?', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer'] 112 | 113 | # Convert token to vocabulary indices 114 | indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) 115 | # Define sentence A and B indices associated to 1st and 2nd sentences (see paper) 116 | segments_ids = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1] 117 | 118 | # Convert inputs to PyTorch tensors 119 | tokens_tensor = torch.tensor([indexed_tokens]) 120 | segments_tensors = torch.tensor([segments_ids]) 121 | ``` 122 | 123 | Let's see how to use `BertModel` to get hidden states 124 | 125 | ```python 126 | # Load pre-trained model (weights) 127 | model = BertModel.from_pretrained('bert-base-uncased') 128 | model.eval() 129 | 130 | # Predict hidden states features for each layer 131 | encoded_layers, _ = model(tokens_tensor, segments_tensors) 132 | # We have a hidden states for each of the 12 layers in model bert-base-uncased 133 | assert len(encoded_layers) == 12 134 | ``` 135 | 136 | And how to use `BertForMaskedLM` 137 | 138 | ```python 139 | # Load pre-trained model (weights) 140 | model = BertForMaskedLM.from_pretrained('bert-base-uncased') 141 | model.eval() 142 | 143 | # Predict all tokens 144 | predictions = model(tokens_tensor, segments_tensors) 145 | 146 | # confirm we were able to predict 'henson' 147 | predicted_index = torch.argmax(predictions[0, masked_index]).item() 148 | predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] 149 | assert predicted_token == 'henson' 150 | ``` 151 | 152 | ## Doc 153 | 154 | Here is a detailed documentation of the classes in the package and how to use them: 155 | 156 | | Sub-section | Description | 157 | |-|-| 158 | | [Loading Google AI's pre-trained weigths](#Loading-Google-AIs-pre-trained-weigths-and-PyTorch-dump) | How to load Google AI's pre-trained weight or a PyTorch saved instance | 159 | | [PyTorch models](#PyTorch-models) | API of the eight PyTorch model classes: `BertModel`, `BertForMaskedLM`, `BertForNextSentencePrediction`, `BertForPreTraining`, `BertForSequenceClassification`, `BertForMultipleChoice` or `BertForQuestionAnswering` | 160 | | [Tokenizer: `BertTokenizer`](#Tokenizer-BertTokenizer) | API of the `BertTokenizer` class| 161 | | [Optimizer: `BertAdam`](#Optimizer-BertAdam) | API of the `BertAdam` class | 162 | 163 | ### Loading Google AI's pre-trained weigths and PyTorch dump 164 | 165 | To load one of Google AI's pre-trained models or a PyTorch saved model (an instance of `BertForPreTraining` saved with `torch.save()`), the PyTorch model classes and the tokenizer can be instantiated as 166 | 167 | ```python 168 | model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None) 169 | ``` 170 | 171 | where 172 | 173 | - `BERT_CLASS` is either the `BertTokenizer` class (to load the vocabulary) or one of the eight PyTorch model classes (to load the pre-trained weights): `BertModel`, `BertForMaskedLM`, `BertForNextSentencePrediction`, `BertForPreTraining`, `BertForSequenceClassification`, `BertForTokenClassification`, `BertForMultipleChoice` or `BertForQuestionAnswering`, and 174 | - `PRE_TRAINED_MODEL_NAME_OR_PATH` is either: 175 | 176 | - the shortcut name of a Google AI's pre-trained model selected in the list: 177 | 178 | - 
`bert-base-uncased`: 12-layer, 768-hidden, 12-heads, 110M parameters 179 | - `bert-large-uncased`: 24-layer, 1024-hidden, 16-heads, 340M parameters 180 | - `bert-base-cased`: 12-layer, 768-hidden, 12-heads , 110M parameters 181 | - `bert-large-cased`: 24-layer, 1024-hidden, 16-heads, 340M parameters 182 | - `bert-base-multilingual-uncased`: (Orig, not recommended) 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters 183 | - `bert-base-multilingual-cased`: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters 184 | - `bert-base-chinese`: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters 185 | 186 | - a path or url to a pretrained model archive containing: 187 | 188 | - `bert_config.json` a configuration file for the model, and 189 | - `pytorch_model.bin` a PyTorch dump of a pre-trained instance `BertForPreTraining` (saved with the usual `torch.save()`) 190 | 191 | If `PRE_TRAINED_MODEL_NAME_OR_PATH` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links [here](pytorch_pretrained_bert/modeling.py)) and stored in a cache folder to avoid future download (the cache folder can be found at `~/.pytorch_pretrained_bert/`). 192 | - `cache_dir` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example `cache_dir='./pretrained_model_{}'.format(args.local_rank)` (see the section on distributed training for more information). 193 | 194 | `Uncased` means that the text has been lowercased before WordPiece tokenization, e.g., `John Smith` becomes `john smith`. The Uncased model also strips out any accent markers. `Cased` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the [Multilingual README](https://github.com/google-research/bert/blob/master/multilingual.md) or the original TensorFlow repository. 195 | 196 | **When using an `uncased model`, make sure to pass `--do_lower_case` to the example training scripts (or pass `do_lower_case=True` to FullTokenizer if you're using your own script and loading the tokenizer your-self.).** 197 | 198 | Example: 199 | ```python 200 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) 201 | model = BertForSequenceClassification.from_pretrained('bert-base-uncased') 202 | ``` 203 | 204 | ### PyTorch models 205 | 206 | #### 1. `BertModel` 207 | 208 | `BertModel` is the basic BERT Transformer model with a layer of summed token, position and sequence embeddings followed by a series of identical self-attention blocks (12 for BERT-base, 24 for BERT-large). 209 | 210 | The inputs and output are **identical to the TensorFlow model inputs and outputs**. 211 | 212 | We detail them here. 
This model takes as *inputs*: 213 | [`modeling.py`](./pytorch_pretrained_bert/modeling.py) 214 | - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts [`extract_features.py`](./examples/extract_features.py), [`run_classifier.py`](./examples/run_classifier.py) and [`run_squad.py`](./examples/run_squad.py)), and 215 | - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see BERT paper for more details). 216 | - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1]. It's a mask to be used if some input sequence lengths are smaller than the max input sequence length of the current batch. It's the mask that we typically use for attention when a batch has varying length sentences. 217 | - `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`. 218 | 219 | This model *outputs* a tuple composed of: 220 | 221 | - `encoded_layers`: controled by the value of the `output_encoded_layers` argument: 222 | 223 | - `output_all_encoded_layers=True`: outputs a list of the encoded-hidden-states at the end of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size], 224 | - `output_all_encoded_layers=False`: outputs only the encoded-hidden-states corresponding to the last attention block, i.e. a single torch.FloatTensor of size [batch_size, sequence_length, hidden_size], 225 | 226 | - `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a classifier pretrained on top of the hidden state associated to the first character of the input (`CLF`) to train on the Next-Sentence task (see BERT's paper). 227 | 228 | An example on how to use this class is given in the [`extract_features.py`](./examples/extract_features.py) script which can be used to extract the hidden states of the model for a given input. 229 | 230 | #### 2. `BertForPreTraining` 231 | 232 | `BertForPreTraining` includes the `BertModel` Transformer followed by the two pre-training heads: 233 | 234 | - the masked language modeling head, and 235 | - the next sentence classification head. 236 | 237 | *Inputs* comprises the inputs of the [`BertModel`](#-1.-`BertModel`) class plus two optional labels: 238 | 239 | - `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size] 240 | - `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] with indices selected in [0, 1]. 0 => next sentence is the continuation, 1 => next sentence is a random sentence. 241 | 242 | *Outputs*: 243 | 244 | - if `masked_lm_labels` and `next_sentence_label` are not `None`: Outputs the total_loss which is the sum of the masked language modeling loss and the next sentence classification loss. 
245 | - if `masked_lm_labels` or `next_sentence_label` is `None`: Outputs a tuple comprising 246 | 247 | - the masked language modeling logits, and 248 | - the next sentence classification logits. 249 | 250 | #### 3. `BertForMaskedLM` 251 | 252 | `BertForMaskedLM` includes the `BertModel` Transformer followed by the (possibly) pre-trained masked language modeling head. 253 | 254 | *Inputs* comprises the inputs of the [`BertModel`](#-1.-`BertModel`) class plus optional label: 255 | 256 | - `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size] 257 | 258 | *Outputs*: 259 | 260 | - if `masked_lm_labels` is not `None`: Outputs the masked language modeling loss. 261 | - if `masked_lm_labels` is `None`: Outputs the masked language modeling logits. 262 | 263 | #### 4. `BertForNextSentencePrediction` 264 | 265 | `BertForNextSentencePrediction` includes the `BertModel` Transformer followed by the next sentence classification head. 266 | 267 | *Inputs* comprises the inputs of the [`BertModel`](#-1.-`BertModel`) class plus an optional label: 268 | 269 | - `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] with indices selected in [0, 1]. 0 => next sentence is the continuation, 1 => next sentence is a random sentence. 270 | 271 | *Outputs*: 272 | 273 | - if `next_sentence_label` is not `None`: Outputs the next sentence classification loss. 274 | - if `next_sentence_label` is `None`: Outputs the next sentence classification logits. 275 | 276 | #### 5. `BertForSequenceClassification` 277 | 278 | `BertForSequenceClassification` is a fine-tuning model that includes `BertModel` and a sequence-level (sequence or pair of sequences) classifier on top of the `BertModel`. 279 | 280 | The sequence-level classifier is a linear layer that takes as input the last hidden state of the first character in the input sequence (see Figures 3a and 3b in the BERT paper). 281 | 282 | An example on how to use this class is given in the [`run_classifier.py`](./examples/run_classifier.py) script which can be used to fine-tune a single sequence (or pair of sequence) classifier using BERT, for example for the MRPC task. 283 | 284 | #### 6. `BertForMultipleChoice` 285 | 286 | `BertForMultipleChoice` is a fine-tuning model that includes `BertModel` and a linear layer on top of the `BertModel`. 287 | 288 | The linear layer outputs a single value for each choice of a multiple choice problem, then all the outputs corresponding to an instance are passed through a softmax to get the model choice. 289 | 290 | This implementation is largely inspired by the work of OpenAI in [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) and the answer of Jacob Devlin in the following [issue](https://github.com/google-research/bert/issues/38). 291 | 292 | An example on how to use this class is given in the [`run_swag.py`](./examples/run_swag.py) script which can be used to fine-tune a multiple choice classifier using BERT, for example for the Swag task. 293 | 294 | #### 7. `BertForTokenClassification` 295 | 296 | `BertForTokenClassification` is a fine-tuning model that includes `BertModel` and a token-level classifier on top of the `BertModel`. 
297 | 298 | The token-level classifier is a linear layer that takes as input the last hidden state of the sequence. 299 | 300 | #### 8. `BertForQuestionAnswering` 301 | 302 | `BertForQuestionAnswering` is a fine-tuning model that includes `BertModel` with a token-level classifiers on top of the full sequence of last hidden states. 303 | 304 | The token-level classifier takes as input the full sequence of the last hidden state and compute several (e.g. two) scores for each tokens that can for example respectively be the score that a given token is a `start_span` and a `end_span` token (see Figures 3c and 3d in the BERT paper). 305 | 306 | An example on how to use this class is given in the [`run_squad.py`](./examples/run_squad.py) script which can be used to fine-tune a token classifier using BERT, for example for the SQuAD task. 307 | 308 | ### Tokenizer: `BertTokenizer` 309 | 310 | `BertTokenizer` perform end-to-end tokenization, i.e. basic tokenization followed by WordPiece tokenization. 311 | 312 | This class has two arguments: 313 | 314 | - `vocab_file`: path to a vocabulary file. 315 | - `do_lower_case`: convert text to lower-case while tokenizing. **Default = True**. 316 | 317 | and three methods: 318 | 319 | - `tokenize(text)`: convert a `str` in a list of `str` tokens by (1) performing basic tokenization and (2) WordPiece tokenization. 320 | - `convert_tokens_to_ids(tokens)`: convert a list of `str` tokens in a list of `int` indices in the vocabulary. 321 | - `convert_ids_to_tokens(tokens)`: convert a list of `int` indices in a list of `str` tokens in the vocabulary. 322 | 323 | Please refer to the doc strings and code in [`tokenization.py`](./pytorch_pretrained_bert/tokenization.py) for the details of the `BasicTokenizer` and `WordpieceTokenizer` classes. In general it is recommended to use `BertTokenizer` unless you know what you are doing. 324 | 325 | ### Optimizer: `BertAdam` 326 | 327 | `BertAdam` is a `torch.optimizer` adapted to be closer to the optimizer used in the TensorFlow implementation of Bert. The differences with PyTorch Adam optimizer are the following: 328 | 329 | - BertAdam implements weight decay fix, 330 | - BertAdam doesn't compensate for bias as in the regular Adam optimizer. 331 | 332 | The optimizer accepts the following arguments: 333 | 334 | - `lr` : learning rate 335 | - `warmup` : portion of `t_total` for the warmup, `-1` means no warmup. Default : `-1` 336 | - `t_total` : total number of training steps for the learning 337 | rate schedule, `-1` means constant learning rate. Default : `-1` 338 | - `schedule` : schedule to use for the warmup (see above). Default : `'warmup_linear'` 339 | - `b1` : Adams b1. Default : `0.9` 340 | - `b2` : Adams b2. Default : `0.999` 341 | - `e` : Adams epsilon. Default : `1e-6` 342 | - `weight_decay:` Weight decay. Default : `0.01` 343 | - `max_grad_norm` : Maximum norm for the gradients (`-1` means no clipping). 
344 |
345 | ## Examples
346 |
347 | | Sub-section | Description |
348 | |-|-|
349 | | [Training large models: introduction, tools and examples](#Training-large-models-introduction,-tools-and-examples) | How to use gradient accumulation, multi-GPU training, distributed training, CPU optimization and 16-bit training to train BERT models |
350 | | [Fine-tuning with BERT: running the examples](#Fine-tuning-with-BERT-running-the-examples) | Running the examples in [`./examples`](./examples/): `extract_classif.py`, `run_classifier.py` and `run_squad.py` |
351 | | [Fine-tuning BERT-large on GPUs](#Fine-tuning-BERT-large-on-GPUs) | How to fine-tune `BERT-large` |
352 |
353 | ### Training large models: introduction, tools and examples
354 |
355 | BERT-base and BERT-large have 110M and 340M parameters respectively, and it can be difficult to fine-tune them on a single GPU with the batch size recommended for good performance (in most cases a batch size of 32).
356 |
357 | To help with fine-tuning these models, we have included several techniques that you can activate in the fine-tuning scripts [`run_classifier.py`](./examples/run_classifier.py) and [`run_squad.py`](./examples/run_squad.py): gradient accumulation, multi-GPU training, distributed training and 16-bit training. For more details on how to use these techniques you can read [the tips on training large batches in PyTorch](https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255) that I published earlier this month.
358 |
359 | Here is how to use these techniques in our scripts:
360 |
361 | - **Gradient Accumulation**: Gradient accumulation can be used by supplying an integer greater than 1 to the `--gradient_accumulation_steps` argument. The batch at each step will be divided by this integer and gradients will be accumulated over `gradient_accumulation_steps` steps (a schematic sketch of this loop is shown below).
362 | - **Multi-GPU**: Multi-GPU training is automatically activated when several GPUs are detected; the batches are split over the GPUs.
363 | - **Distributed training**: Distributed training can be activated by supplying an integer greater than or equal to 0 to the `--local_rank` argument (see below).
364 | - **16-bit training**: 16-bit training, also called mixed-precision training, can reduce the memory requirement of your model on the GPU by using half-precision training, basically allowing you to double the batch size. If you have a recent GPU (starting from the NVIDIA Volta architecture) you should see no decrease in speed. A good introduction to mixed-precision training can be found [here](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) and full documentation is [here](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html). In our scripts, this option can be activated by setting the `--fp16` flag, and you can play with loss scaling using the `--loss_scale` flag (see the previously linked documentation for details on loss scaling). The loss scale can be zero, in which case the scale is dynamically adjusted, or a positive power of two, in which case the scaling is static.
365 |
366 | To use 16-bit training and distributed training, you need to install NVIDIA's apex extension [as detailed here](https://github.com/nvidia/apex). You will find more information regarding the internals of `apex` and how to use `apex` in [the doc and the associated repository](https://github.com/nvidia/apex). The results of the tests performed on pytorch-BERT by the NVIDIA team (and my trials at reproducing them) can be consulted in [the relevant PR of the present repository](https://github.com/huggingface/pytorch-pretrained-BERT/pull/116).
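To make the gradient-accumulation option concrete, here is a self-contained schematic of the accumulation loop using a toy model. It only illustrates the mechanism; it is not the code from `run_classifier.py` or the other example scripts, where the same pattern is driven by the `--gradient_accumulation_steps` argument.

```python
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

# Toy stand-ins: in the real scripts these are the BERT model, BertAdam and the task data.
model = nn.Linear(16, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
dataset = TensorDataset(torch.randn(64, 16), torch.randint(0, 2, (64,)))
train_dataloader = DataLoader(dataset, batch_size=8)
loss_fct = nn.CrossEntropyLoss()

gradient_accumulation_steps = 4  # effective batch size = 8 * 4 = 32

model.train()
optimizer.zero_grad()
for step, (features, labels) in enumerate(train_dataloader):
    loss = loss_fct(model(features), labels)
    # Scale the loss so the accumulated gradient matches one large batch.
    loss = loss / gradient_accumulation_steps
    loss.backward()
    if (step + 1) % gradient_accumulation_steps == 0:
        optimizer.step()        # one parameter update every 4 small batches
        optimizer.zero_grad()
```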
367 |
368 | Note: To use *Distributed Training*, you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see [the above-mentioned blog post](https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255) for more details):
369 | ```bash
370 | python -m torch.distributed.launch --nproc_per_node=4 --nnodes=2 --node_rank=$THIS_MACHINE_INDEX --master_addr="192.168.1.1" --master_port=1234 run_classifier.py (--arg1 --arg2 --arg3 and all other arguments of the run_classifier script)
371 | ```
372 | where `$THIS_MACHINE_INDEX` is a sequential index assigned to each of your machines (0, 1, 2...), and the machine with rank 0 has the IP address `192.168.1.1` and an open port `1234`.
373 |
374 | ### Fine-tuning with BERT: running the examples
375 |
376 | We showcase several fine-tuning examples based on (and extended from) [the original implementation](https://github.com/google-research/bert/):
377 |
378 | - a *sequence-level classifier* on the MRPC classification corpus,
379 | - a *token-level classifier* on the question answering dataset SQuAD, and
380 | - a *sequence-level multiple-choice classifier* on the SWAG classification corpus.
381 |
382 | #### MRPC
383 |
384 | This example code fine-tunes BERT on the Microsoft Research Paraphrase
385 | Corpus (MRPC) and runs in less than 10 minutes on a single K-80, or in 27 seconds (!) on a single Tesla V100 16GB with apex installed.
386 |
387 | Before running this example you should download the
388 | [GLUE data](https://gluebenchmark.com/tasks) by running
389 | [this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
390 | and unpack it to some directory `$GLUE_DIR`.
391 |
392 | ```shell
393 | export GLUE_DIR=/path/to/glue
394 |
395 | python run_classifier.py \
396 |   --task_name MRPC \
397 |   --do_train \
398 |   --do_eval \
399 |   --do_lower_case \
400 |   --data_dir $GLUE_DIR/MRPC/ \
401 |   --bert_model bert-base-uncased \
402 |   --max_seq_length 128 \
403 |   --train_batch_size 32 \
404 |   --learning_rate 2e-5 \
405 |   --num_train_epochs 3.0 \
406 |   --output_dir /tmp/mrpc_output/
407 | ```
408 |
409 | Our tests, run on a few seeds with [the original implementation hyper-parameters](https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks), gave evaluation results between 84% and 88%.
410 |
411 | **Fast run with apex and 16-bit precision: fine-tuning on MRPC in 27 seconds!**
412 | First install apex as indicated [here](https://github.com/NVIDIA/apex).
413 | Then run:
414 | ```shell
415 | export GLUE_DIR=/path/to/glue
416 |
417 | python run_classifier.py \
418 |   --task_name MRPC \
419 |   --do_train \
420 |   --do_eval \
421 |   --do_lower_case \
422 |   --data_dir $GLUE_DIR/MRPC/ \
423 |   --bert_model bert-base-uncased \
424 |   --max_seq_length 128 \
425 |   --train_batch_size 32 \
426 |   --learning_rate 2e-5 \
427 |   --num_train_epochs 3.0 \
428 |   --output_dir /tmp/mrpc_output/ --fp16
429 | ```
430 |
431 | #### SQuAD
432 |
433 | This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) on a single Tesla V100 16GB.
434 | 435 | The data for SQuAD can be downloaded with the following links and should be saved in a `$SQUAD_DIR` directory. 436 | 437 | * [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json) 438 | * [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json) 439 | * [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py) 440 | 441 | ```shell 442 | export SQUAD_DIR=/path/to/SQUAD 443 | 444 | python run_squad.py \ 445 | --bert_model bert-base-uncased \ 446 | --do_train \ 447 | --do_predict \ 448 | --do_lower_case \ 449 | --train_file $SQUAD_DIR/train-v1.1.json \ 450 | --predict_file $SQUAD_DIR/dev-v1.1.json \ 451 | --train_batch_size 12 \ 452 | --learning_rate 3e-5 \ 453 | --num_train_epochs 2.0 \ 454 | --max_seq_length 384 \ 455 | --doc_stride 128 \ 456 | --output_dir /tmp/debug_squad/ 457 | ``` 458 | 459 | Training with the previous hyper-parameters gave us the following results: 460 | ```bash 461 | {"f1": 88.52381567990474, "exact_match": 81.22043519394512} 462 | ``` 463 | 464 | #### SWAG 465 | 466 | The data for SWAG can be downloaded by cloning the following [repository](https://github.com/rowanz/swagaf) 467 | 468 | ```shell 469 | export SWAG_DIR=/path/to/SWAG 470 | 471 | python run_swag.py \ 472 | --bert_model bert-base-uncased \ 473 | --do_train \ 474 | --do_lower_case \ 475 | --do_eval \ 476 | --data_dir $SWAG_DIR/data \ 477 | --train_batch_size 16 \ 478 | --learning_rate 2e-5 \ 479 | --num_train_epochs 3.0 \ 480 | --max_seq_length 80 \ 481 | --output_dir /tmp/swag_output/ \ 482 | --gradient_accumulation_steps 4 483 | ``` 484 | 485 | Training with the previous hyper-parameters on a single GPU gave us the following results: 486 | ``` 487 | eval_accuracy = 0.8062081375587323 488 | eval_loss = 0.5966546792367169 489 | global_step = 13788 490 | loss = 0.06423990014260186 491 | ``` 492 | 493 | ## Fine-tuning BERT-large on GPUs 494 | 495 | The options we list above allow to fine-tune BERT-large rather easily on GPU(s) instead of the TPU used by the original implementation. 496 | 497 | For example, fine-tuning BERT-large on SQuAD can be done on a server with 4 k-80 (these are pretty old now) in 18 hours. Our results are similar to the TensorFlow implementation results (actually slightly higher): 498 | ```bash 499 | {"exact_match": 84.56953642384106, "f1": 91.04028647786927} 500 | ``` 501 | To get these results we used a combination of: 502 | - multi-GPU training (automatically activated on a multi-GPU server), 503 | - 2 steps of gradient accumulation and 504 | - perform the optimization step on CPU to store Adam's averages in RAM. 505 | 506 | Here is the full list of hyper-parameters for this run: 507 | ```bash 508 | python ./run_squad.py \ 509 | --bert_model bert-large-uncased \ 510 | --do_train \ 511 | --do_predict \ 512 | --do_lower_case \ 513 | --train_file $SQUAD_TRAIN \ 514 | --predict_file $SQUAD_EVAL \ 515 | --learning_rate 3e-5 \ 516 | --num_train_epochs 2 \ 517 | --max_seq_length 384 \ 518 | --doc_stride 128 \ 519 | --output_dir $OUTPUT_DIR \ 520 | --train_batch_size 24 \ 521 | --gradient_accumulation_steps 2 522 | ``` 523 | 524 | If you have a recent GPU (starting from NVIDIA Volta series), you should try **16-bit fine-tuning** (FP16). 
525 |
526 | Here is an example of hyper-parameters for an FP16 run we tried:
527 | ```bash
528 | python ./run_squad.py \
529 |   --bert_model bert-large-uncased \
530 |   --do_train \
531 |   --do_predict \
532 |   --do_lower_case \
533 |   --train_file $SQUAD_TRAIN \
534 |   --predict_file $SQUAD_EVAL \
535 |   --learning_rate 3e-5 \
536 |   --num_train_epochs 2 \
537 |   --max_seq_length 384 \
538 |   --doc_stride 128 \
539 |   --output_dir $OUTPUT_DIR \
540 |   --train_batch_size 24 \
541 |   --fp16 \
542 |   --loss_scale 128
543 | ```
544 |
545 | The results were similar to the above FP32 results (actually slightly higher):
546 | ```bash
547 | {"exact_match": 84.65468306527909, "f1": 91.238669287002}
548 | ```
549 |
550 | ## Notebooks
551 |
552 | We include [three Jupyter notebooks](https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/notebooks) that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
553 |
554 | - The first notebook ([Comparing-TF-and-PT-models.ipynb](./notebooks/Comparing-TF-and-PT-models.ipynb)) extracts the hidden states of a full sequence at each layer of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden states of the models.
555 |
556 | - The second notebook ([Comparing-TF-and-PT-models-SQuAD.ipynb](./notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb)) compares the loss computed by the TensorFlow and the PyTorch models for an identical initialization of the fine-tuning layer of `BertForQuestionAnswering` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
557 |
558 | - The third notebook ([Comparing-TF-and-PT-models-MLM-NSP.ipynb](./notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb)) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
559 |
560 | Please follow the instructions given in the notebooks to run and modify them.
561 |
562 | ## Command-line interface
563 |
564 | A command-line interface is provided to convert a TensorFlow checkpoint into a PyTorch dump of the `BertForPreTraining` class (see above).
565 |
566 | You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) into a PyTorch save file by using the [`./pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py`](convert_tf_checkpoint_to_pytorch.py) script.
567 |
568 | This CLI takes as input a TensorFlow checkpoint (three files starting with `bert_model.ckpt`) and the associated configuration file (`bert_config.json`), creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint into the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using `torch.load()` (see examples in [`extract_features.py`](./examples/extract_features.py), [`run_classifier.py`](./examples/run_classifier.py) and [`run_squad.py`](./examples/run_squad.py)).
569 |
570 | You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow checkpoint (the three files starting with `bert_model.ckpt`) but be sure to keep the configuration file (`bert_config.json`) and the vocabulary file (`vocab.txt`), as these are needed for the PyTorch model too.
571 |
572 | To run this specific conversion script you will need to have TensorFlow and PyTorch installed (`pip install tensorflow`). The rest of the repository only requires PyTorch.
573 |
574 | Here is an example of the conversion process for a pre-trained `BERT-Base Uncased` model:
575 |
576 | ```shell
577 | export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
578 |
579 | pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch \
580 |   $BERT_BASE_DIR/bert_model.ckpt \
581 |   $BERT_BASE_DIR/bert_config.json \
582 |   $BERT_BASE_DIR/pytorch_model.bin
583 | ```
584 |
585 | You can download Google's pre-trained models for the conversion [here](https://github.com/google-research/bert#pre-trained-models).
586 |
587 | ## TPU
588 |
589 | TPU support and pretraining scripts
590 |
591 | TPUs are not supported by the current stable release of PyTorch (0.4.1). However, the next version of PyTorch (v1.0) should support training on TPUs and is expected to be released soon (see the recent [official announcement](https://cloud.google.com/blog/products/ai-machine-learning/introducing-pytorch-across-google-cloud)).
592 |
593 | We will add TPU support when this next release is published.
594 |
595 | The original TensorFlow code further comprises two scripts for pre-training BERT: [create_pretraining_data.py](https://github.com/google-research/bert/blob/master/create_pretraining_data.py) and [run_pretraining.py](https://github.com/google-research/bert/blob/master/run_pretraining.py).
596 |
597 | Since pre-training BERT is a particularly expensive operation that basically requires one or several TPUs to be completed in a reasonable amount of time (see details [here](https://github.com/google-research/bert#pre-training-with-bert)), we have decided to wait for the inclusion of TPU support in PyTorch before converting these pre-training scripts.
598 |
--------------------------------------------------------------------------------
/code/pytorch_pretrained_bert/modeling.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """PyTorch BERT model.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import copy 24 | import json 25 | import math 26 | import logging 27 | import tarfile 28 | import tempfile 29 | import shutil 30 | 31 | import torch 32 | from torch import nn 33 | from torch.nn import CrossEntropyLoss 34 | 35 | from .file_utils import cached_path 36 | 37 | logger = logging.getLogger(__name__) 38 | 39 | PRETRAINED_MODEL_ARCHIVE_MAP = { 40 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz", 41 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz", 42 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz", 43 | 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz", 44 | 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz", 45 | 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz", 46 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", 47 | } 48 | CONFIG_NAME = 'bert_config.json' 49 | WEIGHTS_NAME = 'pytorch_model.bin' 50 | 51 | def gelu(x): 52 | """Implementation of the gelu activation function. 53 | For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 54 | 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 55 | """ 56 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 57 | 58 | 59 | def swish(x): 60 | return x * torch.sigmoid(x) 61 | 62 | 63 | ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} 64 | 65 | 66 | class BertConfig(object): 67 | """Configuration class to store the configuration of a `BertModel`. 68 | """ 69 | def __init__(self, 70 | vocab_size_or_config_json_file, 71 | hidden_size=768, 72 | num_hidden_layers=12, 73 | num_attention_heads=12, 74 | intermediate_size=3072, 75 | hidden_act="gelu", 76 | hidden_dropout_prob=0.1, 77 | attention_probs_dropout_prob=0.1, 78 | max_position_embeddings=512, 79 | type_vocab_size=2, 80 | initializer_range=0.02): 81 | """Constructs BertConfig. 82 | 83 | Args: 84 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. 85 | hidden_size: Size of the encoder layers and the pooler layer. 86 | num_hidden_layers: Number of hidden layers in the Transformer encoder. 87 | num_attention_heads: Number of attention heads for each attention layer in 88 | the Transformer encoder. 89 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 90 | layer in the Transformer encoder. 91 | hidden_act: The non-linear activation function (function or string) in the 92 | encoder and pooler. If string, "gelu", "relu" and "swish" are supported. 93 | hidden_dropout_prob: The dropout probabilitiy for all fully connected 94 | layers in the embeddings, encoder, and pooler. 95 | attention_probs_dropout_prob: The dropout ratio for the attention 96 | probabilities. 97 | max_position_embeddings: The maximum sequence length that this model might 98 | ever be used with. Typically set this to something large just in case 99 | (e.g., 512 or 1024 or 2048). 100 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 101 | `BertModel`. 
102 | initializer_range: The sttdev of the truncated_normal_initializer for 103 | initializing all weight matrices. 104 | """ 105 | if isinstance(vocab_size_or_config_json_file, str): 106 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 107 | json_config = json.loads(reader.read()) 108 | for key, value in json_config.items(): 109 | self.__dict__[key] = value 110 | elif isinstance(vocab_size_or_config_json_file, int): 111 | self.vocab_size = vocab_size_or_config_json_file 112 | self.hidden_size = hidden_size 113 | self.num_hidden_layers = num_hidden_layers 114 | self.num_attention_heads = num_attention_heads 115 | self.hidden_act = hidden_act 116 | self.intermediate_size = intermediate_size 117 | self.hidden_dropout_prob = hidden_dropout_prob 118 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 119 | self.max_position_embeddings = max_position_embeddings 120 | self.type_vocab_size = type_vocab_size 121 | self.initializer_range = initializer_range 122 | else: 123 | raise ValueError("First argument must be either a vocabulary size (int)" 124 | "or the path to a pretrained model config file (str)") 125 | 126 | @classmethod 127 | def from_dict(cls, json_object): 128 | """Constructs a `BertConfig` from a Python dictionary of parameters.""" 129 | config = BertConfig(vocab_size_or_config_json_file=-1) 130 | for key, value in json_object.items(): 131 | config.__dict__[key] = value 132 | return config 133 | 134 | @classmethod 135 | def from_json_file(cls, json_file): 136 | """Constructs a `BertConfig` from a json file of parameters.""" 137 | with open(json_file, "r", encoding='utf-8') as reader: 138 | text = reader.read() 139 | return cls.from_dict(json.loads(text)) 140 | 141 | def __repr__(self): 142 | return str(self.to_json_string()) 143 | 144 | def to_dict(self): 145 | """Serializes this instance to a Python dictionary.""" 146 | output = copy.deepcopy(self.__dict__) 147 | return output 148 | 149 | def to_json_string(self): 150 | """Serializes this instance to a JSON string.""" 151 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 152 | 153 | try: 154 | from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm 155 | except ImportError: 156 | print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.") 157 | class BertLayerNorm(nn.Module): 158 | def __init__(self, hidden_size, eps=1e-12): 159 | """Construct a layernorm module in the TF style (epsilon inside the square root). 160 | """ 161 | super(BertLayerNorm, self).__init__() 162 | self.weight = nn.Parameter(torch.ones(hidden_size)) 163 | self.bias = nn.Parameter(torch.zeros(hidden_size)) 164 | self.variance_epsilon = eps 165 | 166 | def forward(self, x): 167 | u = x.mean(-1, keepdim=True) 168 | s = (x - u).pow(2).mean(-1, keepdim=True) 169 | x = (x - u) / torch.sqrt(s + self.variance_epsilon) 170 | return self.weight * x + self.bias 171 | 172 | class BertEmbeddings(nn.Module): 173 | """Construct the embeddings from word, position and token_type embeddings. 
174 | """ 175 | def __init__(self, config): 176 | super(BertEmbeddings, self).__init__() 177 | self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) 178 | self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) 179 | self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) 180 | 181 | # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load 182 | # any TensorFlow checkpoint file 183 | self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) 184 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 185 | 186 | def forward(self, input_ids, token_type_ids=None): 187 | seq_length = input_ids.size(1) 188 | position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) 189 | position_ids = position_ids.unsqueeze(0).expand_as(input_ids) 190 | if token_type_ids is None: 191 | token_type_ids = torch.zeros_like(input_ids) 192 | 193 | words_embeddings = self.word_embeddings(input_ids) 194 | position_embeddings = self.position_embeddings(position_ids) 195 | token_type_embeddings = self.token_type_embeddings(token_type_ids) 196 | 197 | embeddings = words_embeddings + position_embeddings + token_type_embeddings 198 | embeddings = self.LayerNorm(embeddings) 199 | embeddings = self.dropout(embeddings) 200 | return embeddings 201 | 202 | 203 | class BertSelfAttention(nn.Module): 204 | def __init__(self, config): 205 | super(BertSelfAttention, self).__init__() 206 | if config.hidden_size % config.num_attention_heads != 0: 207 | raise ValueError( 208 | "The hidden size (%d) is not a multiple of the number of attention " 209 | "heads (%d)" % (config.hidden_size, config.num_attention_heads)) 210 | self.num_attention_heads = config.num_attention_heads 211 | self.attention_head_size = int(config.hidden_size / config.num_attention_heads) 212 | self.all_head_size = self.num_attention_heads * self.attention_head_size 213 | 214 | self.query = nn.Linear(config.hidden_size, self.all_head_size) 215 | self.key = nn.Linear(config.hidden_size, self.all_head_size) 216 | self.value = nn.Linear(config.hidden_size, self.all_head_size) 217 | 218 | self.dropout = nn.Dropout(config.attention_probs_dropout_prob) 219 | 220 | def transpose_for_scores(self, x): 221 | new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) 222 | x = x.view(*new_x_shape) 223 | return x.permute(0, 2, 1, 3) 224 | 225 | def forward(self, hidden_states, attention_mask): 226 | mixed_query_layer = self.query(hidden_states) 227 | mixed_key_layer = self.key(hidden_states) 228 | mixed_value_layer = self.value(hidden_states) 229 | 230 | query_layer = self.transpose_for_scores(mixed_query_layer) 231 | key_layer = self.transpose_for_scores(mixed_key_layer) 232 | value_layer = self.transpose_for_scores(mixed_value_layer) 233 | 234 | # Take the dot product between "query" and "key" to get the raw attention scores. 235 | attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) 236 | attention_scores = attention_scores / math.sqrt(self.attention_head_size) 237 | # Apply the attention mask is (precomputed for all layers in BertModel forward() function) 238 | attention_scores = attention_scores + attention_mask 239 | 240 | # Normalize the attention scores to probabilities. 
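        # attention_scores has shape [batch_size, num_heads, seq_length, seq_length];
        # masked positions were already pushed to a large negative value by the additive
        # attention_mask above, so they receive near-zero probability after the softmax.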
241 | attention_probs = nn.Softmax(dim=-1)(attention_scores) 242 | 243 | # This is actually dropping out entire tokens to attend to, which might 244 | # seem a bit unusual, but is taken from the original Transformer paper. 245 | attention_probs = self.dropout(attention_probs) 246 | 247 | context_layer = torch.matmul(attention_probs, value_layer) 248 | context_layer = context_layer.permute(0, 2, 1, 3).contiguous() 249 | new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) 250 | context_layer = context_layer.view(*new_context_layer_shape) 251 | return context_layer 252 | 253 | 254 | class BertSelfOutput(nn.Module): 255 | def __init__(self, config): 256 | super(BertSelfOutput, self).__init__() 257 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 258 | self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) 259 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 260 | 261 | def forward(self, hidden_states, input_tensor): 262 | hidden_states = self.dense(hidden_states) 263 | hidden_states = self.dropout(hidden_states) 264 | hidden_states = self.LayerNorm(hidden_states + input_tensor) 265 | return hidden_states 266 | 267 | 268 | class BertAttention(nn.Module): 269 | def __init__(self, config): 270 | super(BertAttention, self).__init__() 271 | self.self = BertSelfAttention(config) 272 | self.output = BertSelfOutput(config) 273 | 274 | def forward(self, input_tensor, attention_mask): 275 | self_output = self.self(input_tensor, attention_mask) 276 | attention_output = self.output(self_output, input_tensor) 277 | return attention_output 278 | 279 | 280 | class BertIntermediate(nn.Module): 281 | def __init__(self, config): 282 | super(BertIntermediate, self).__init__() 283 | self.dense = nn.Linear(config.hidden_size, config.intermediate_size) 284 | self.intermediate_act_fn = ACT2FN[config.hidden_act] \ 285 | if isinstance(config.hidden_act, str) else config.hidden_act 286 | 287 | def forward(self, hidden_states): 288 | hidden_states = self.dense(hidden_states) 289 | hidden_states = self.intermediate_act_fn(hidden_states) 290 | return hidden_states 291 | 292 | 293 | class BertOutput(nn.Module): 294 | def __init__(self, config): 295 | super(BertOutput, self).__init__() 296 | self.dense = nn.Linear(config.intermediate_size, config.hidden_size) 297 | self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) 298 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 299 | 300 | def forward(self, hidden_states, input_tensor): 301 | hidden_states = self.dense(hidden_states) 302 | hidden_states = self.dropout(hidden_states) 303 | hidden_states = self.LayerNorm(hidden_states + input_tensor) 304 | return hidden_states 305 | 306 | 307 | class BertLayer(nn.Module): 308 | def __init__(self, config): 309 | super(BertLayer, self).__init__() 310 | self.attention = BertAttention(config) 311 | self.intermediate = BertIntermediate(config) 312 | self.output = BertOutput(config) 313 | 314 | def forward(self, hidden_states, attention_mask): 315 | attention_output = self.attention(hidden_states, attention_mask) 316 | intermediate_output = self.intermediate(attention_output) 317 | layer_output = self.output(intermediate_output, attention_output) 318 | return layer_output 319 | 320 | 321 | class BertEncoder(nn.Module): 322 | def __init__(self, config): 323 | super(BertEncoder, self).__init__() 324 | layer = BertLayer(config) 325 | self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) 326 | 327 | def forward(self, hidden_states, 
attention_mask, output_all_encoded_layers=True): 328 | all_encoder_layers = [] 329 | for layer_module in self.layer: 330 | hidden_states = layer_module(hidden_states, attention_mask) 331 | if output_all_encoded_layers: 332 | all_encoder_layers.append(hidden_states) 333 | if not output_all_encoded_layers: 334 | all_encoder_layers.append(hidden_states) 335 | return all_encoder_layers 336 | 337 | 338 | class BertPooler(nn.Module): 339 | def __init__(self, config): 340 | super(BertPooler, self).__init__() 341 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 342 | self.activation = nn.Tanh() 343 | 344 | def forward(self, hidden_states): 345 | # We "pool" the model by simply taking the hidden state corresponding 346 | # to the first token. 347 | first_token_tensor = hidden_states[:, 0] 348 | pooled_output = self.dense(first_token_tensor) 349 | pooled_output = self.activation(pooled_output) 350 | return pooled_output 351 | 352 | 353 | class BertPredictionHeadTransform(nn.Module): 354 | def __init__(self, config): 355 | super(BertPredictionHeadTransform, self).__init__() 356 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 357 | self.transform_act_fn = ACT2FN[config.hidden_act] \ 358 | if isinstance(config.hidden_act, str) else config.hidden_act 359 | self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) 360 | 361 | def forward(self, hidden_states): 362 | hidden_states = self.dense(hidden_states) 363 | hidden_states = self.transform_act_fn(hidden_states) 364 | hidden_states = self.LayerNorm(hidden_states) 365 | return hidden_states 366 | 367 | 368 | class BertLMPredictionHead(nn.Module): 369 | def __init__(self, config, bert_model_embedding_weights): 370 | super(BertLMPredictionHead, self).__init__() 371 | self.transform = BertPredictionHeadTransform(config) 372 | 373 | # The output weights are the same as the input embeddings, but there is 374 | # an output-only bias for each token. 
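        # Note: the decoder weight below is tied to the word embedding matrix
        # (weight sharing), so MLM logits are obtained by projecting the
        # transformed hidden states back onto the embedding space.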
375 | self.decoder = nn.Linear(bert_model_embedding_weights.size(1), 376 | bert_model_embedding_weights.size(0), 377 | bias=False) 378 | self.decoder.weight = bert_model_embedding_weights 379 | self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0))) 380 | 381 | def forward(self, hidden_states): 382 | hidden_states = self.transform(hidden_states) 383 | hidden_states = self.decoder(hidden_states) + self.bias 384 | return hidden_states 385 | 386 | 387 | class BertOnlyMLMHead(nn.Module): 388 | def __init__(self, config, bert_model_embedding_weights): 389 | super(BertOnlyMLMHead, self).__init__() 390 | self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) 391 | 392 | def forward(self, sequence_output): 393 | prediction_scores = self.predictions(sequence_output) 394 | return prediction_scores 395 | 396 | 397 | class BertOnlyNSPHead(nn.Module): 398 | def __init__(self, config): 399 | super(BertOnlyNSPHead, self).__init__() 400 | self.seq_relationship = nn.Linear(config.hidden_size, 2) 401 | 402 | def forward(self, pooled_output): 403 | seq_relationship_score = self.seq_relationship(pooled_output) 404 | return seq_relationship_score 405 | 406 | 407 | class BertPreTrainingHeads(nn.Module): 408 | def __init__(self, config, bert_model_embedding_weights): 409 | super(BertPreTrainingHeads, self).__init__() 410 | self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) 411 | self.seq_relationship = nn.Linear(config.hidden_size, 2) 412 | 413 | def forward(self, sequence_output, pooled_output): 414 | prediction_scores = self.predictions(sequence_output) 415 | seq_relationship_score = self.seq_relationship(pooled_output) 416 | return prediction_scores, seq_relationship_score 417 | 418 | 419 | class PreTrainedBertModel(nn.Module): 420 | """ An abstract class to handle weights initialization and 421 | a simple interface for dowloading and loading pretrained models. 422 | """ 423 | def __init__(self, config, *inputs, **kwargs): 424 | super(PreTrainedBertModel, self).__init__() 425 | if not isinstance(config, BertConfig): 426 | raise ValueError( 427 | "Parameter config in `{}(config)` should be an instance of class `BertConfig`. " 428 | "To create a model from a Google pretrained model use " 429 | "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( 430 | self.__class__.__name__, self.__class__.__name__ 431 | )) 432 | self.config = config 433 | 434 | def init_bert_weights(self, module): 435 | """ Initialize the weights. 436 | """ 437 | if isinstance(module, (nn.Linear, nn.Embedding)): 438 | # Slightly different from the TF version which uses truncated_normal for initialization 439 | # cf https://github.com/pytorch/pytorch/pull/5617 440 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) 441 | elif isinstance(module, BertLayerNorm): 442 | module.bias.data.normal_(mean=0.0, std=self.config.initializer_range) 443 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) 444 | if isinstance(module, nn.Linear) and module.bias is not None: 445 | module.bias.data.zero_() 446 | 447 | @classmethod 448 | def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None, *inputs, **kwargs): 449 | """ 450 | Instantiate a PreTrainedBertModel from a pre-trained model file or a pytorch state dict. 451 | Download and cache the pre-trained model file if needed. 
452 | 453 | Params: 454 | pretrained_model_name: either: 455 | - a str with the name of a pre-trained model to load selected in the list of: 456 | . `bert-base-uncased` 457 | . `bert-large-uncased` 458 | . `bert-base-cased` 459 | . `bert-base-multilingual` 460 | . `bert-base-chinese` 461 | - a path or url to a pretrained model archive containing: 462 | . `bert_config.json` a configuration file for the model 463 | . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance 464 | cache_dir: an optional path to a folder in which the pre-trained models will be cached. 465 | state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models 466 | *inputs, **kwargs: additional input for the specific Bert class 467 | (ex: num_labels for BertForSequenceClassification) 468 | """ 469 | if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP: 470 | archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name] 471 | else: 472 | archive_file = pretrained_model_name 473 | # redirect to the cache, if necessary 474 | resolved_archive_file = archive_file 475 | """ 476 | try: 477 | resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) 478 | except FileNotFoundError: 479 | logger.error( 480 | "Model name '{}' was not found in model name list ({}). " 481 | "We assumed '{}' was a path or url but couldn't find any file " 482 | "associated to this path or url.".format( 483 | pretrained_model_name, 484 | ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), 485 | archive_file)) 486 | return None 487 | """ 488 | if resolved_archive_file == archive_file: 489 | logger.info("loading archive file {}".format(archive_file)) 490 | else: 491 | logger.info("loading archive file {} from cache at {}".format( 492 | archive_file, resolved_archive_file)) 493 | tempdir = None 494 | if os.path.isdir(resolved_archive_file) or resolved_archive_file.startswith('//philly'): 495 | serialization_dir = resolved_archive_file 496 | else: 497 | # Extract archive to temp dir 498 | tempdir = tempfile.mkdtemp() 499 | logger.info("extracting archive file {} to temp dir {}".format( 500 | resolved_archive_file, tempdir)) 501 | with tarfile.open(resolved_archive_file, 'r:gz') as archive: 502 | archive.extractall(tempdir) 503 | serialization_dir = tempdir 504 | # Load config 505 | #config_file = os.path.join(serialization_dir, CONFIG_NAME) 506 | config_file = '/'.join([serialization_dir, CONFIG_NAME]) 507 | config = BertConfig.from_json_file(config_file) 508 | #logger.info("Model config {}".format(config)) 509 | # Instantiate model. 
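        # The TensorFlow-style parameter names 'gamma'/'beta' found in some checkpoints
        # are renamed below to the 'weight'/'bias' names expected by the PyTorch
        # LayerNorm parameters before the state dict is loaded into the model.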
510 | model = cls(config, *inputs, **kwargs) 511 | if state_dict is None: 512 | weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) 513 | state_dict = torch.load(weights_path) 514 | 515 | old_keys = [] 516 | new_keys = [] 517 | for key in state_dict.keys(): 518 | new_key = None 519 | if 'gamma' in key: 520 | new_key = key.replace('gamma', 'weight') 521 | if 'beta' in key: 522 | new_key = key.replace('beta', 'bias') 523 | if new_key: 524 | old_keys.append(key) 525 | new_keys.append(new_key) 526 | for old_key, new_key in zip(old_keys, new_keys): 527 | state_dict[new_key] = state_dict.pop(old_key) 528 | 529 | missing_keys = [] 530 | unexpected_keys = [] 531 | error_msgs = [] 532 | # copy state_dict so _load_from_state_dict can modify it 533 | metadata = getattr(state_dict, '_metadata', None) 534 | state_dict = state_dict.copy() 535 | if metadata is not None: 536 | state_dict._metadata = metadata 537 | 538 | def load(module, prefix=''): 539 | local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) 540 | module._load_from_state_dict( 541 | state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) 542 | for name, child in module._modules.items(): 543 | if child is not None: 544 | load(child, prefix + name + '.') 545 | load(model, prefix='' if hasattr(model, 'bert') else 'bert.') 546 | if len(missing_keys) > 0: 547 | logger.info("Weights of {} not initialized from pretrained model: {}".format( 548 | model.__class__.__name__, missing_keys)) 549 | if len(unexpected_keys) > 0: 550 | logger.info("Weights from pretrained model not used in {}: {}".format( 551 | model.__class__.__name__, unexpected_keys)) 552 | if tempdir: 553 | # Clean up temp dir 554 | shutil.rmtree(tempdir) 555 | return model 556 | 557 | 558 | class BertModel(PreTrainedBertModel): 559 | """BERT model ("Bidirectional Embedding Representations from a Transformer"). 560 | 561 | Params: 562 | config: a BertConfig class instance with the configuration to build a new model 563 | 564 | Inputs: 565 | `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] 566 | with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts 567 | `extract_features.py`, `run_classifier.py` and `run_squad.py`) 568 | `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token 569 | types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to 570 | a `sentence B` token (see BERT paper for more details). 571 | `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices 572 | selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max 573 | input sequence length in the current batch. It's the mask that we typically use for attention when 574 | a batch has varying length sentences. 575 | `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`. 576 | 577 | Outputs: Tuple of (encoded_layers, pooled_output) 578 | `encoded_layers`: controled by `output_all_encoded_layers` argument: 579 | - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end 580 | of each attention block (i.e. 
12 full sequences for BERT-base, 24 for BERT-large), each 581 | encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size], 582 | - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding 583 | to the last attention block of shape [batch_size, sequence_length, hidden_size], 584 | `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a 585 | classifier pretrained on top of the hidden state associated to the first character of the 586 | input (`CLF`) to train on the Next-Sentence task (see BERT's paper). 587 | 588 | Example usage: 589 | ```python 590 | # Already been converted into WordPiece token ids 591 | input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) 592 | input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) 593 | token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) 594 | 595 | config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, 596 | num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) 597 | 598 | model = modeling.BertModel(config=config) 599 | all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) 600 | ``` 601 | """ 602 | def __init__(self, config): 603 | super(BertModel, self).__init__(config) 604 | self.embeddings = BertEmbeddings(config) 605 | self.encoder = BertEncoder(config) 606 | self.pooler = BertPooler(config) 607 | self.apply(self.init_bert_weights) 608 | 609 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True): 610 | if attention_mask is None: 611 | attention_mask = torch.ones_like(input_ids) 612 | if token_type_ids is None: 613 | token_type_ids = torch.zeros_like(input_ids) 614 | 615 | # We create a 3D attention mask from a 2D tensor mask. 616 | # Sizes are [batch_size, 1, 1, to_seq_length] 617 | # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] 618 | # this attention mask is more simple than the triangular masking of causal attention 619 | # used in OpenAI GPT, we just need to prepare the broadcast dimension here. 620 | extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) 621 | 622 | # Since attention_mask is 1.0 for positions we want to attend and 0.0 for 623 | # masked positions, this operation will create a tensor which is 0.0 for 624 | # positions we want to attend and -10000.0 for masked positions. 625 | # Since we are adding it to the raw scores before the softmax, this is 626 | # effectively the same as removing these entirely. 627 | extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility 628 | extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 629 | 630 | embedding_output = self.embeddings(input_ids, token_type_ids) 631 | encoded_layers = self.encoder(embedding_output, 632 | extended_attention_mask, 633 | output_all_encoded_layers=output_all_encoded_layers) 634 | sequence_output = encoded_layers[-1] 635 | pooled_output = self.pooler(sequence_output) 636 | if not output_all_encoded_layers: 637 | encoded_layers = encoded_layers[-1] 638 | return encoded_layers, pooled_output 639 | 640 | 641 | class BertForPreTraining(PreTrainedBertModel): 642 | """BERT model with pre-training heads. 643 | This module comprises the BERT model followed by the two pre-training heads: 644 | - the masked language modeling head, and 645 | - the next sentence classification head. 
646 | 647 | Params: 648 | config: a BertConfig class instance with the configuration to build a new model. 649 | 650 | Inputs: 651 | `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] 652 | with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts 653 | `extract_features.py`, `run_classifier.py` and `run_squad.py`) 654 | `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token 655 | types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to 656 | a `sentence B` token (see BERT paper for more details). 657 | `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices 658 | selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max 659 | input sequence length in the current batch. It's the mask that we typically use for attention when 660 | a batch has varying length sentences. 661 | `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] 662 | with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss 663 | is only computed for the labels set in [0, ..., vocab_size] 664 | `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] 665 | with indices selected in [0, 1]. 666 | 0 => next sentence is the continuation, 1 => next sentence is a random sentence. 667 | 668 | Outputs: 669 | if `masked_lm_labels` and `next_sentence_label` are not `None`: 670 | Outputs the total_loss which is the sum of the masked language modeling loss and the next 671 | sentence classification loss. 672 | if `masked_lm_labels` or `next_sentence_label` is `None`: 673 | Outputs a tuple comprising 674 | - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and 675 | - the next sentence classification logits of shape [batch_size, 2]. 
676 | 677 | Example usage: 678 | ```python 679 | # Already been converted into WordPiece token ids 680 | input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) 681 | input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) 682 | token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) 683 | 684 | config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, 685 | num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) 686 | 687 | model = BertForPreTraining(config) 688 | masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) 689 | ``` 690 | """ 691 | def __init__(self, config): 692 | super(BertForPreTraining, self).__init__(config) 693 | self.bert = BertModel(config) 694 | self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight) 695 | self.apply(self.init_bert_weights) 696 | 697 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, next_sentence_label=None): 698 | sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, 699 | output_all_encoded_layers=False) 700 | prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) 701 | 702 | if masked_lm_labels is not None and next_sentence_label is not None: 703 | loss_fct = CrossEntropyLoss(ignore_index=-1) 704 | masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) 705 | next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) 706 | total_loss = masked_lm_loss + next_sentence_loss 707 | return total_loss 708 | else: 709 | return prediction_scores, seq_relationship_score 710 | 711 | 712 | class BertForMaskedLM(PreTrainedBertModel): 713 | """BERT model with the masked language modeling head. 714 | This module comprises the BERT model followed by the masked language modeling head. 715 | 716 | Params: 717 | config: a BertConfig class instance with the configuration to build a new model. 718 | 719 | Inputs: 720 | `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] 721 | with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts 722 | `extract_features.py`, `run_classifier.py` and `run_squad.py`) 723 | `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token 724 | types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to 725 | a `sentence B` token (see BERT paper for more details). 726 | `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices 727 | selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max 728 | input sequence length in the current batch. It's the mask that we typically use for attention when 729 | a batch has varying length sentences. 730 | `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] 731 | with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss 732 | is only computed for the labels set in [0, ..., vocab_size] 733 | 734 | Outputs: 735 | if `masked_lm_labels` is `None`: 736 | Outputs the masked language modeling loss. 737 | if `masked_lm_labels` is `None`: 738 | Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size]. 
739 | 740 | Example usage: 741 | ```python 742 | # Already been converted into WordPiece token ids 743 | input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) 744 | input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) 745 | token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) 746 | 747 | config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, 748 | num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) 749 | 750 | model = BertForMaskedLM(config) 751 | masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask) 752 | ``` 753 | """ 754 | def __init__(self, config): 755 | super(BertForMaskedLM, self).__init__(config) 756 | self.bert = BertModel(config) 757 | self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight) 758 | self.apply(self.init_bert_weights) 759 | 760 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None): 761 | sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, 762 | output_all_encoded_layers=False) 763 | prediction_scores = self.cls(sequence_output) 764 | 765 | if masked_lm_labels is not None: 766 | loss_fct = CrossEntropyLoss(ignore_index=-1) 767 | masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) 768 | return masked_lm_loss 769 | else: 770 | return prediction_scores 771 | 772 | 773 | class BertForNextSentencePrediction(PreTrainedBertModel): 774 | """BERT model with next sentence prediction head. 775 | This module comprises the BERT model followed by the next sentence classification head. 776 | 777 | Params: 778 | config: a BertConfig class instance with the configuration to build a new model. 779 | 780 | Inputs: 781 | `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] 782 | with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts 783 | `extract_features.py`, `run_classifier.py` and `run_squad.py`) 784 | `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token 785 | types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to 786 | a `sentence B` token (see BERT paper for more details). 787 | `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices 788 | selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max 789 | input sequence length in the current batch. It's the mask that we typically use for attention when 790 | a batch has varying length sentences. 791 | `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] 792 | with indices selected in [0, 1]. 793 | 0 => next sentence is the continuation, 1 => next sentence is a random sentence. 794 | 795 | Outputs: 796 | if `next_sentence_label` is not `None`: 797 | Outputs the total_loss which is the sum of the masked language modeling loss and the next 798 | sentence classification loss. 799 | if `next_sentence_label` is `None`: 800 | Outputs the next sentence classification logits of shape [batch_size, 2]. 
801 | 802 | Example usage: 803 | ```python 804 | # Already been converted into WordPiece token ids 805 | input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) 806 | input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) 807 | token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) 808 | 809 | config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, 810 | num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) 811 | 812 | model = BertForNextSentencePrediction(config) 813 | seq_relationship_logits = model(input_ids, token_type_ids, input_mask) 814 | ``` 815 | """ 816 | def __init__(self, config): 817 | super(BertForNextSentencePrediction, self).__init__(config) 818 | self.bert = BertModel(config) 819 | self.cls = BertOnlyNSPHead(config) 820 | self.apply(self.init_bert_weights) 821 | 822 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None): 823 | _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, 824 | output_all_encoded_layers=False) 825 | seq_relationship_score = self.cls( pooled_output) 826 | 827 | if next_sentence_label is not None: 828 | loss_fct = CrossEntropyLoss(ignore_index=-1) 829 | next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) 830 | return next_sentence_loss 831 | else: 832 | return seq_relationship_score 833 | 834 | 835 | class BertForSequenceClassification(PreTrainedBertModel): 836 | """BERT model for classification. 837 | This module is composed of the BERT model with a linear layer on top of 838 | the pooled output. 839 | 840 | Params: 841 | `config`: a BertConfig class instance with the configuration to build a new model. 842 | `num_labels`: the number of classes for the classifier. Default = 2. 843 | 844 | Inputs: 845 | `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] 846 | with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts 847 | `extract_features.py`, `run_classifier.py` and `run_squad.py`) 848 | `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token 849 | types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to 850 | a `sentence B` token (see BERT paper for more details). 851 | `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices 852 | selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max 853 | input sequence length in the current batch. It's the mask that we typically use for attention when 854 | a batch has varying length sentences. 855 | `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] 856 | with indices selected in [0, ..., num_labels]. 857 | 858 | Outputs: 859 | if `labels` is not `None`: 860 | Outputs the CrossEntropy classification loss of the output with the labels. 861 | if `labels` is `None`: 862 | Outputs the classification logits of shape [batch_size, num_labels]. 
863 | 864 | Example usage: 865 | ```python 866 | # Already been converted into WordPiece token ids 867 | input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) 868 | input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) 869 | token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) 870 | 871 | config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, 872 | num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) 873 | 874 | num_labels = 2 875 | 876 | model = BertForSequenceClassification(config, num_labels) 877 | logits = model(input_ids, token_type_ids, input_mask) 878 | ``` 879 | """ 880 | def __init__(self, config, num_labels=2): 881 | super(BertForSequenceClassification, self).__init__(config) 882 | self.num_labels = num_labels 883 | self.bert = BertModel(config) 884 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 885 | self.classifier = nn.Linear(config.hidden_size, num_labels) 886 | self.apply(self.init_bert_weights) 887 | 888 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): 889 | _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) 890 | pooled_output = self.dropout(pooled_output) 891 | logits = self.classifier(pooled_output) 892 | 893 | if labels is not None: 894 | loss_fct = CrossEntropyLoss() 895 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 896 | return loss 897 | else: 898 | return logits 899 | 900 | 901 | class BertForMultipleChoice(PreTrainedBertModel): 902 | """BERT model for multiple choice tasks. 903 | This module is composed of the BERT model with a linear layer on top of 904 | the pooled output. 905 | Params: 906 | `config`: a BertConfig class instance with the configuration to build a new model. 907 | `num_choices`: the number of classes for the classifier. Default = 2. 908 | Inputs: 909 | `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] 910 | with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts 911 | `extract_features.py`, `run_classifier.py` and `run_squad.py`) 912 | `token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] 913 | with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` 914 | and type 1 corresponds to a `sentence B` token (see BERT paper for more details). 915 | `attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices 916 | selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max 917 | input sequence length in the current batch. It's the mask that we typically use for attention when 918 | a batch has varying length sentences. 919 | `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] 920 | with indices selected in [0, ..., num_choices]. 921 | Outputs: 922 | if `labels` is not `None`: 923 | Outputs the CrossEntropy classification loss of the output with the labels. 924 | if `labels` is `None`: 925 | Outputs the classification logits of shape [batch_size, num_labels]. 
926 | Example usage: 927 | ```python 928 | # Already been converted into WordPiece token ids 929 | input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]]) 930 | input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]]) 931 | token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]]) 932 | config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, 933 | num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) 934 | num_choices = 2 935 | model = BertForMultipleChoice(config, num_choices) 936 | logits = model(input_ids, token_type_ids, input_mask) 937 | ``` 938 | """ 939 | def __init__(self, config, num_choices=2): 940 | super(BertForMultipleChoice, self).__init__(config) 941 | self.num_choices = num_choices 942 | self.bert = BertModel(config) 943 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 944 | self.classifier = nn.Linear(config.hidden_size, 1) 945 | self.apply(self.init_bert_weights) 946 | 947 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): 948 | flat_input_ids = input_ids.view(-1, input_ids.size(-1)) 949 | flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) 950 | flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) 951 | _, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False) 952 | pooled_output = self.dropout(pooled_output) 953 | logits = self.classifier(pooled_output) 954 | reshaped_logits = logits.view(-1, self.num_choices) 955 | 956 | if labels is not None: 957 | loss_fct = CrossEntropyLoss() 958 | loss = loss_fct(reshaped_logits, labels) 959 | return loss 960 | else: 961 | return reshaped_logits 962 | 963 | 964 | class BertForMultipleChoiceQA(PreTrainedBertModel): 965 | """BERT model for multiple choice question answering tasks. 966 | This module is composed of the BERT model with a linear layer on top of 967 | the pooled output. 968 | 969 | Params: 970 | `config`: a BertConfig class instance with the configuration to build a new model. 971 | `num_choices`: the number of answer options scored per question. Default = 4. 972 | 973 | Inputs: 974 | `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] 975 | with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts 976 | `extract_features.py`, `run_classifier.py` and `run_squad.py`) 977 | `token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] 978 | with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` 979 | and type 1 corresponds to a `sentence B` token (see BERT paper for more details). 980 | `attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices 981 | selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max 982 | input sequence length in the current batch. It's the mask that we typically use for attention when 983 | a batch has varying length sentences. 984 | `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] 985 | with indices selected in [0, ..., num_choices - 1]. 986 | 987 | Outputs: 988 | if `labels` is not `None`: 989 | Outputs a tuple of the classification logits of shape [batch_size, num_choices] and the CrossEntropy classification loss. 990 | if `labels` is `None`: 991 | Outputs the classification logits of shape [batch_size, num_choices].
992 | 993 | Example usage: 994 | ```python 995 | # Already been converted into WordPiece token ids 996 | input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]]) 997 | input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]]) 998 | token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]]) 999 | config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, 1000 | num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) 1001 | 1002 | num_choices = 2 1003 | 1004 | model = BertForMultipleChoiceQA(config, num_choices) 1005 | logits = model(input_ids, token_type_ids, input_mask) 1006 | ``` 1007 | """ 1008 | def __init__(self, config, num_choices=4): 1009 | super(BertForMultipleChoiceQA, self).__init__(config) 1010 | self.num_labels = num_choices 1011 | self.bert = BertModel(config) 1012 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 1013 | self.classifier = nn.Linear(config.hidden_size, 1) 1014 | self.apply(self.init_bert_weights) 1015 | 1016 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): 1017 | # B x num_choices x T -> (B * num_choices) x T 1018 | batch_size = input_ids.size(0) 1019 | T = input_ids.size(2) 1020 | input_ids_2 = input_ids.view(batch_size * self.num_labels, T).contiguous() 1021 | token_type_ids_2 = token_type_ids.view(batch_size * self.num_labels, T).contiguous() 1022 | attention_mask_2 = attention_mask.view(batch_size * self.num_labels, T).contiguous() 1023 | # logger.info("input_ids_2 size: {}".format(input_ids_2.size())) 1024 | 1025 | _, pooled_output = self.bert(input_ids_2, token_type_ids_2, attention_mask_2, output_all_encoded_layers=False) 1026 | pooled_output = self.dropout(pooled_output) 1027 | # (B * num_choices) x hidden -> (B * num_choices) x 1 1028 | logits = self.classifier(pooled_output) 1029 | # (B * num_choices) x 1 -> B x num_choices 1030 | logits = logits.view(batch_size, self.num_labels) 1031 | # logits = nn.Softmax(dim=-1)(logits) 1032 | 1033 | if labels is not None: 1034 | loss_fct = CrossEntropyLoss() 1035 | loss = loss_fct(logits, labels.view(-1)) # (B x num_choices), (B,) 1036 | return logits, loss 1037 | else: 1038 | return logits 1039 | 1040 | 1041 | class BertForTokenClassification(PreTrainedBertModel): 1042 | """BERT model for token-level classification. 1043 | This module is composed of the BERT model with a linear layer on top of 1044 | the full hidden state of the last layer. 1045 | 1046 | Params: 1047 | `config`: a BertConfig class instance with the configuration to build a new model. 1048 | `num_labels`: the number of classes for the classifier. Default = 2. 1049 | 1050 | Inputs: 1051 | `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] 1052 | with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts 1053 | `extract_features.py`, `run_classifier.py` and `run_squad.py`) 1054 | `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token 1055 | types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to 1056 | a `sentence B` token (see BERT paper for more details). 1057 | `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices 1058 | selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max 1059 | input sequence length in the current batch. It's the mask that we typically use for attention when 1060 | a batch has varying length sentences.
1061 | `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] 1062 | with indices selected in [0, ..., num_labels]. 1063 | 1064 | Outputs: 1065 | if `labels` is not `None`: 1066 | Outputs the CrossEntropy classification loss of the output with the labels. 1067 | if `labels` is `None`: 1068 | Outputs the classification logits of shape [batch_size, sequence_length, num_labels]. 1069 | 1070 | Example usage: 1071 | ```python 1072 | # Already been converted into WordPiece token ids 1073 | input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) 1074 | input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) 1075 | token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) 1076 | 1077 | config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, 1078 | num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) 1079 | 1080 | num_labels = 2 1081 | 1082 | model = BertForTokenClassification(config, num_labels) 1083 | logits = model(input_ids, token_type_ids, input_mask) 1084 | ``` 1085 | """ 1086 | def __init__(self, config, num_labels=2): 1087 | super(BertForTokenClassification, self).__init__(config) 1088 | self.num_labels = num_labels 1089 | self.bert = BertModel(config) 1090 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 1091 | self.classifier = nn.Linear(config.hidden_size, num_labels) 1092 | self.apply(self.init_bert_weights) 1093 | 1094 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): 1095 | sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) 1096 | sequence_output = self.dropout(sequence_output) 1097 | logits = self.classifier(sequence_output) 1098 | 1099 | if labels is not None: 1100 | loss_fct = CrossEntropyLoss() 1101 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 1102 | return loss 1103 | else: 1104 | return logits 1105 | 1106 | 1107 | class BertForQuestionAnswering(PreTrainedBertModel): 1108 | """BERT model for Question Answering (span extraction). 1109 | This module is composed of the BERT model with a linear layer on top of 1110 | the sequence output that computes start_logits and end_logits 1111 | 1112 | Params: 1113 | `config`: either 1114 | - a BertConfig class instance with the configuration to build a new model, or 1115 | - a str with the name of a pre-trained model to load selected in the list of: 1116 | . `bert-base-uncased` 1117 | . `bert-large-uncased` 1118 | . `bert-base-cased` 1119 | . `bert-base-multilingual` 1120 | . `bert-base-chinese` 1121 | The pre-trained model will be downloaded and cached if needed. 1122 | 1123 | Inputs: 1124 | `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] 1125 | with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts 1126 | `extract_features.py`, `run_classifier.py` and `run_squad.py`) 1127 | `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token 1128 | types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to 1129 | a `sentence B` token (see BERT paper for more details). 1130 | `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices 1131 | selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max 1132 | input sequence length in the current batch. It's the mask that we typically use for attention when 1133 | a batch has varying length sentences. 
1134 | `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size]. 1135 | Positions are clamped to the length of the sequence and position outside of the sequence are not taken 1136 | into account for computing the loss. 1137 | `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size]. 1138 | Positions are clamped to the length of the sequence and position outside of the sequence are not taken 1139 | into account for computing the loss. 1140 | 1141 | Outputs: 1142 | if `start_positions` and `end_positions` are not `None`: 1143 | Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions. 1144 | if `start_positions` or `end_positions` is `None`: 1145 | Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end 1146 | position tokens of shape [batch_size, sequence_length]. 1147 | 1148 | Example usage: 1149 | ```python 1150 | # Already been converted into WordPiece token ids 1151 | input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) 1152 | input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) 1153 | token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) 1154 | 1155 | config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, 1156 | num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) 1157 | 1158 | model = BertForQuestionAnswering(config) 1159 | start_logits, end_logits = model(input_ids, token_type_ids, input_mask) 1160 | ``` 1161 | """ 1162 | def __init__(self, config): 1163 | super(BertForQuestionAnswering, self).__init__(config) 1164 | self.bert = BertModel(config) 1165 | # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version 1166 | # self.dropout = nn.Dropout(config.hidden_dropout_prob) 1167 | self.qa_outputs = nn.Linear(config.hidden_size, 2) 1168 | self.apply(self.init_bert_weights) 1169 | 1170 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None): 1171 | sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) 1172 | logits = self.qa_outputs(sequence_output) 1173 | start_logits, end_logits = logits.split(1, dim=-1) 1174 | start_logits = start_logits.squeeze(-1) 1175 | end_logits = end_logits.squeeze(-1) 1176 | 1177 | if start_positions is not None and end_positions is not None: 1178 | # If we are on multi-GPU, split add a dimension 1179 | if len(start_positions.size()) > 1: 1180 | start_positions = start_positions.squeeze(-1) 1181 | if len(end_positions.size()) > 1: 1182 | end_positions = end_positions.squeeze(-1) 1183 | # sometimes the start/end positions are outside our model inputs, we ignore these terms 1184 | ignored_index = start_logits.size(1) 1185 | start_positions.clamp_(0, ignored_index) 1186 | end_positions.clamp_(0, ignored_index) 1187 | 1188 | loss_fct = CrossEntropyLoss(ignore_index=ignored_index) 1189 | start_loss = loss_fct(start_logits, start_positions) 1190 | end_loss = loss_fct(end_logits, end_positions) 1191 | total_loss = (start_loss + end_loss) / 2 1192 | return total_loss 1193 | else: 1194 | return start_logits, end_logits 1195 | --------------------------------------------------------------------------------
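A minimal usage sketch for the repo-specific `BertForMultipleChoiceQA` head defined above (toy configuration, random weights, and made-up shapes; the real entry point is run_arc.py, which loads the pre-trained checkpoint and the ARC data, and this assumes the `code/` package is importable):

```python
import torch

from pytorch_pretrained_bert.modeling import BertConfig, BertForMultipleChoiceQA

# Toy BERT-base-sized configuration; a real run restores the pre-trained Google weights instead.
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
                    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForMultipleChoiceQA(config, num_choices=4)
model.eval()  # disable dropout for the sanity check

# Inputs are [batch_size, num_choices, seq_len]: one tokenized (question, answer option) pair per row.
batch_size, num_choices, seq_len = 2, 4, 16
input_ids = torch.randint(0, 32000, (batch_size, num_choices, seq_len), dtype=torch.long)
token_type_ids = torch.zeros(batch_size, num_choices, seq_len, dtype=torch.long)
attention_mask = torch.ones(batch_size, num_choices, seq_len, dtype=torch.long)
labels = torch.tensor([0, 2])  # index of the correct option for each question

# With labels the forward pass returns (logits, loss); without labels it returns only the logits.
logits, loss = model(input_ids, token_type_ids, attention_mask, labels)
print(logits.shape)  # torch.Size([2, 4])
```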