├── data_mutil
├── data
│   ├── READ
│   └── labels.txt
├── README.md
├── configuration_roberta1.py
├── metrics1.py
├── utils.py
├── tokenization_roberta1.py
├── configuration_bert1.py
├── optimization1.py
├── tokenization_gpt21.py
├── configuration_utils1.py
├── file_utils1.py
├── glue1.py
├── tokenization_bert1.py
├── modeling_roberta1.py
├── run_glue.py
└── modeling_utils1.py

/data_mutil:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/data/READ:
--------------------------------------------------------------------------------
1 | This is the folder where our text data is stored.
2 | 
--------------------------------------------------------------------------------
/data/labels.txt:
--------------------------------------------------------------------------------
1 | inform_theater;
2 | inform_starttime;
3 | inform_numberofpeople
4 | greeting;
5 | thanks
6 | inform_other
7 | request_moviename;
8 | inform_genre
9 | request_ticket;
10 | inform_city;
11 | inform_state;
12 | inform_date
13 | inform_moviename
14 | confirm_answer;
15 | inform_zip
16 | inform_video_format
17 | 
18 | 
19 | 
20 | 
21 | 
22 | 
23 | 
24 | 
25 | 
26 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # BERT for multi-label text classification
2 | Fine-tunes the pretrained PyTorch bert-base-uncased model to perform multi-label text classification.
3 | 
4 | 
5 | The project folder contains several subfolders; data is the text folder and holds the training, validation and test data.
6 | Data description: each instance consists of one sentence and its corresponding labels. There are 16 labels in total; the full label set is listed in labels.txt under the data folder.
7 | First, install the transformers package, e.g. via pip install transformers or the corresponding conda command.
8 | Then open run_glue.py and edit the file paths: the text folder path (data_dir) and the model (bert-base-uncased) paths. The model path consists of three parts: the first is uncased-model.bin, i.e. the PyTorch model weights; the second is the model's JSON configuration file; the third is vocab.txt. Each of these paths must be updated in run_glue.py.
9 | 
10 | The three model files must be downloaded manually!
11 | Debug/step through run_glue.py: MODEL_ALL in that file contains the download URLs for all three files!
12 | 
--------------------------------------------------------------------------------
/configuration_roberta1.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
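# A minimal sketch of the path set-up described in the README above. The file
# names below are placeholders for the three files whose download URLs are kept
# in MODEL_ALL inside run_glue.py (PyTorch weights, JSON config, vocab.txt);
# point them at your local copies. BertTokenizer from tokenization_bert1.py is
# assumed to follow the usual transformers constructor.
def _example_local_model_paths():  # illustrative only, never called by the code
    from configuration_bert1 import BertConfig
    from tokenization_bert1 import BertTokenizer

    data_dir = "./data"                                      # train/dev/test text files
    weights_path = "./bert-base-uncased/uncased-model.bin"   # PyTorch checkpoint
    config_path = "./bert-base-uncased/config.json"          # model configuration (JSON)
    vocab_path = "./bert-base-uncased/vocab.txt"             # WordPiece vocabulary

    # 16 classes, matching data/labels.txt
    config = BertConfig.from_pretrained(config_path, num_labels=16)
    tokenizer = BertTokenizer(vocab_path, do_lower_case=True)
    return data_dir, weights_path, config, tokenizer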
16 | """ RoBERTa configuration """ 17 | 18 | from __future__ import (absolute_import, division, print_function, 19 | unicode_literals) 20 | 21 | import logging 22 | 23 | from configuration_bert1 import BertConfig 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 28 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", 29 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", 30 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", 31 | } 32 | 33 | 34 | class RobertaConfig(BertConfig): 35 | pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 36 | -------------------------------------------------------------------------------- /metrics1.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import csv 18 | import sys 19 | import logging 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | try: 24 | from scipy.stats import pearsonr, spearmanr 25 | from sklearn.metrics import matthews_corrcoef, f1_score 26 | _has_sklearn = True 27 | except (AttributeError, ImportError) as e: 28 | logger.warning("To use data.metrics please install scikit-learn. 
See https://scikit-learn.org/stable/index.html") 29 | _has_sklearn = False 30 | 31 | def is_sklearn_available(): 32 | return _has_sklearn 33 | 34 | if _has_sklearn: 35 | 36 | def simple_accuracy(preds, labels): 37 | return (preds == labels).mean() 38 | 39 | 40 | def acc_and_f1(preds, labels): 41 | acc = simple_accuracy(preds, labels) 42 | f1 = f1_score(y_true=labels, y_pred=preds) 43 | return { 44 | "acc": acc, 45 | "f1": f1, 46 | "acc_and_f1": (acc + f1) / 2, 47 | } 48 | 49 | 50 | def pearson_and_spearman(preds, labels): 51 | pearson_corr = pearsonr(preds, labels)[0] 52 | spearman_corr = spearmanr(preds, labels)[0] 53 | return { 54 | "pearson": pearson_corr, 55 | "spearmanr": spearman_corr, 56 | "corr": (pearson_corr + spearman_corr) / 2, 57 | } 58 | 59 | 60 | def glue_compute_metrics(task_name, preds, labels): 61 | assert len(preds) == len(labels) 62 | if task_name == "cola": 63 | return {"mcc": matthews_corrcoef(labels, preds)} 64 | elif task_name == "sst-2": 65 | return {"acc": simple_accuracy(preds, labels)} 66 | elif task_name == "mrpc": 67 | return acc_and_f1(preds, labels) 68 | elif task_name == "sts-b": 69 | return pearson_and_spearman(preds, labels) 70 | elif task_name == "qqp": 71 | return acc_and_f1(preds, labels) 72 | elif task_name == "mnli": 73 | return {"acc": simple_accuracy(preds, labels)} 74 | elif task_name == "mnli-mm": 75 | return {"acc": simple_accuracy(preds, labels)} 76 | elif task_name == "qnli": 77 | return {"acc": simple_accuracy(preds, labels)} 78 | elif task_name == "rte": 79 | return {"acc": simple_accuracy(preds, labels)} 80 | elif task_name == "wnli": 81 | return {"acc": simple_accuracy(preds, labels)} 82 | elif task_name == "multilabel": 83 | return {"acc": simple_accuracy(preds, labels)} 84 | else: 85 | raise KeyError(task_name) 86 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import csv 18 | import sys 19 | import copy 20 | import json 21 | 22 | class InputExample(object): 23 | """ 24 | A single training/test example for simple sequence classification. 25 | 26 | Args: 27 | guid: Unique id for the example. 28 | text_a: string. The untokenized text of the first sequence. For single 29 | sequence tasks, only this sequence must be specified. 30 | text_b: (Optional) string. The untokenized text of the second sequence. 31 | Only must be specified for sequence pair tasks. 32 | label: (Optional) string. The label of the example. This should be 33 | specified for train and dev examples, but not for test examples. 
34 | """ 35 | def __init__(self, guid, text_a, text_b=None, label=None): 36 | self.guid = guid 37 | self.text_a = text_a 38 | self.text_b = text_b 39 | self.label = label 40 | 41 | 42 | 43 | 44 | 45 | def __repr__(self): 46 | return str(self.to_json_string()) 47 | 48 | def to_dict(self): 49 | """Serializes this instance to a Python dictionary.""" 50 | output = copy.deepcopy(self.__dict__) 51 | return output 52 | 53 | def to_json_string(self): 54 | """Serializes this instance to a JSON string.""" 55 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 56 | 57 | 58 | class InputFeatures(object): 59 | """ 60 | A single set of features of data. 61 | 62 | Args: 63 | input_ids: Indices of input sequence tokens in the vocabulary. 64 | attention_mask: Mask to avoid performing attention on padding token indices. 65 | Mask values selected in ``[0, 1]``: 66 | Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens. 67 | token_type_ids: Segment token indices to indicate first and second portions of the inputs. 68 | label: Label corresponding to the input 69 | """ 70 | 71 | def __init__(self, input_ids, attention_mask, token_type_ids, label): 72 | self.input_ids = input_ids 73 | self.attention_mask = attention_mask 74 | self.token_type_ids = token_type_ids 75 | self.label = label 76 | 77 | def __repr__(self): 78 | return str(self.to_json_string()) 79 | 80 | def to_dict(self): 81 | """Serializes this instance to a Python dictionary.""" 82 | output = copy.deepcopy(self.__dict__) 83 | return output 84 | 85 | def to_json_string(self): 86 | """Serializes this instance to a JSON string.""" 87 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 88 | 89 | 90 | class DataProcessor(object): 91 | """Base class for data converters for sequence classification data sets.""" 92 | 93 | def get_example_from_tensor_dict(self, tensor_dict): 94 | """Gets an example from a dict with tensorflow tensors 95 | 96 | Args: 97 | tensor_dict: Keys and values should match the corresponding Glue 98 | tensorflow_dataset examples. 99 | """ 100 | raise NotImplementedError() 101 | 102 | def get_train_examples(self, data_dir): 103 | """Gets a collection of `InputExample`s for the train set.""" 104 | raise NotImplementedError() 105 | 106 | def get_dev_examples(self, data_dir): 107 | """Gets a collection of `InputExample`s for the dev set.""" 108 | raise NotImplementedError() 109 | 110 | def get_labels(self): 111 | """Gets the list of labels for this data set.""" 112 | raise NotImplementedError() 113 | 114 | @classmethod 115 | def _read_tsv(cls, input_file, quotechar=None): 116 | """Reads a tab separated value file.""" 117 | with open(input_file, "r", encoding="utf-8-sig") as f: 118 | reader = csv.reader(f, delimiter="\t", quotechar=quotechar) 119 | lines = [] 120 | for line in reader: 121 | if sys.version_info[0] == 2: 122 | line = list(unicode(cell, 'utf-8') for cell in line) 123 | lines.append(line) 124 | return lines 125 | -------------------------------------------------------------------------------- /tokenization_roberta1.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
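# How the utils.py classes above are typically consumed: a task-specific
# processor (the real ones live in glue1.py) reads the files under data_dir and
# wraps each sentence together with its labels in an InputExample. A sketch,
# assuming a two-column TSV layout (sentence <TAB> semicolon-separated labels),
# which is an illustration rather than the repository's actual file format:
#
#     import os
#     from utils import DataProcessor, InputExample
#
#     class MultiLabelProcessor(DataProcessor):
#         def get_labels(self):
#             # the 16 intents listed in data/labels.txt
#             with open("data/labels.txt", encoding="utf-8") as f:
#                 return [line.strip().rstrip(";") for line in f if line.strip()]
#
#         def get_train_examples(self, data_dir):
#             examples = []
#             for i, line in enumerate(self._read_tsv(os.path.join(data_dir, "train.tsv"))):
#                 examples.append(InputExample(guid="train-%d" % i,
#                                              text_a=line[0],
#                                              label=line[1].split(";")))
#             return examples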
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for RoBERTa.""" 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import sys 20 | import json 21 | import logging 22 | import os 23 | import regex as re 24 | from io import open 25 | 26 | from tokenization_gpt21 import GPT2Tokenizer 27 | 28 | try: 29 | from functools import lru_cache 30 | except ImportError: 31 | # Just a dummy decorator to get the checks to run on python2 32 | # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 33 | def lru_cache(): 34 | return lambda func: func 35 | 36 | logger = logging.getLogger(__name__) 37 | 38 | VOCAB_FILES_NAMES = { 39 | 'vocab_file': 'vocab.json', 40 | 'merges_file': 'merges.txt', 41 | } 42 | 43 | PRETRAINED_VOCAB_FILES_MAP = { 44 | 'vocab_file': 45 | { 46 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", 47 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", 48 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json", 49 | }, 50 | 'merges_file': 51 | { 52 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", 53 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", 54 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt", 55 | }, 56 | } 57 | 58 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 59 | 'roberta-base': 512, 60 | 'roberta-large': 512, 61 | 'roberta-large-mnli': 512, 62 | } 63 | 64 | 65 | class RobertaTokenizer(GPT2Tokenizer): 66 | """ 67 | RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: 68 | - Byte-level Byte-Pair-Encoding 69 | - Requires a space to start the input string => the encoding methods should be called with the 70 | ``add_prefix_space`` flag set to ``True``. 
71 |         Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
72 |         the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
73 |     """
74 |     vocab_files_names = VOCAB_FILES_NAMES
75 |     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
76 |     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
77 | 
78 |     def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>",
79 |                  cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs):
80 |         super(RobertaTokenizer, self).__init__(vocab_file=vocab_file, merges_file=merges_file, errors=errors,
81 |                                                bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
82 |                                                sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
83 |                                                mask_token=mask_token, **kwargs)
84 |         self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
85 |         self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
86 | 
87 |     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
88 |         """
89 |         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
90 |         by concatenating and adding special tokens.
91 |         A RoBERTa sequence has the following format:
92 |             single sequence: <s> X </s>
93 |             pair of sequences: <s> A </s></s> B </s>
94 |         """
95 |         if token_ids_1 is None:
96 |             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
97 |         cls = [self.cls_token_id]
98 |         sep = [self.sep_token_id]
99 |         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
100 | 
101 |     def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
102 |         """
103 |         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
104 |         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
105 | 
106 |         Args:
107 |             token_ids_0: list of ids (must not contain special tokens)
108 |             token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
109 |                 for sequence pairs
110 |             already_has_special_tokens: (default False) Set to True if the token list is already formatted with
111 |                 special tokens for the model
112 | 
113 |         Returns:
114 |             A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
115 |         """
116 |         if already_has_special_tokens:
117 |             if token_ids_1 is not None:
118 |                 raise ValueError("You should not supply a second sequence if the provided sequence of "
119 |                                  "ids is already formatted with special tokens for the model.")
120 |             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
121 | 
122 |         if token_ids_1 is None:
123 |             return [1] + ([0] * len(token_ids_0)) + [1]
124 |         return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
125 | 
126 |     def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
127 |         """
128 |         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
129 |         A RoBERTa sequence pair mask has the following format:
130 |         0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
131 |         | first sequence    | second sequence
132 | 
133 |         if token_ids_1 is None, only returns the first portion of the mask (0's).
134 | """ 135 | sep = [self.sep_token_id] 136 | cls = [self.cls_token_id] 137 | 138 | if token_ids_1 is None: 139 | return len(cls + token_ids_0 + sep) * [0] 140 | return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1] 141 | -------------------------------------------------------------------------------- /configuration_bert1.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ BERT model configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from configuration_utils1 import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 30 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", 31 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", 32 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", 33 | 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", 34 | 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", 35 | 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", 36 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", 37 | 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", 38 | 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", 39 | 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", 40 | 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", 41 | 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", 42 | 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", 43 | 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json", 44 | 'bert-base-german-dbmdz-uncased': 
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json", 45 | } 46 | 47 | 48 | class BertConfig(PretrainedConfig): 49 | r""" 50 | :class:`~transformers.BertConfig` is the configuration class to store the configuration of a 51 | `BertModel`. 52 | 53 | 54 | Arguments: 55 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. 56 | hidden_size: Size of the encoder layers and the pooler layer. 57 | num_hidden_layers: Number of hidden layers in the Transformer encoder. 58 | num_attention_heads: Number of attention heads for each attention layer in 59 | the Transformer encoder. 60 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 61 | layer in the Transformer encoder. 62 | hidden_act: The non-linear activation function (function or string) in the 63 | encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. 64 | hidden_dropout_prob: The dropout probabilitiy for all fully connected 65 | layers in the embeddings, encoder, and pooler. 66 | attention_probs_dropout_prob: The dropout ratio for the attention 67 | probabilities. 68 | max_position_embeddings: The maximum sequence length that this model might 69 | ever be used with. Typically set this to something large just in case 70 | (e.g., 512 or 1024 or 2048). 71 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 72 | `BertModel`. 73 | initializer_range: The sttdev of the truncated_normal_initializer for 74 | initializing all weight matrices. 75 | layer_norm_eps: The epsilon used by LayerNorm. 76 | """ 77 | pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP 78 | 79 | def __init__(self, 80 | vocab_size_or_config_json_file=30522, 81 | hidden_size=768, 82 | num_hidden_layers=12, 83 | num_attention_heads=12, 84 | intermediate_size=3072, 85 | hidden_act="gelu", 86 | hidden_dropout_prob=0.1, 87 | attention_probs_dropout_prob=0.1, 88 | max_position_embeddings=512, 89 | type_vocab_size=2, 90 | initializer_range=0.02, 91 | layer_norm_eps=1e-12, 92 | **kwargs): 93 | super(BertConfig, self).__init__(**kwargs) 94 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 95 | and isinstance(vocab_size_or_config_json_file, unicode)): 96 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 97 | json_config = json.loads(reader.read()) 98 | for key, value in json_config.items(): 99 | self.__dict__[key] = value 100 | elif isinstance(vocab_size_or_config_json_file, int): 101 | self.vocab_size = vocab_size_or_config_json_file 102 | self.hidden_size = hidden_size 103 | self.num_hidden_layers = num_hidden_layers 104 | self.num_attention_heads = num_attention_heads 105 | self.hidden_act = hidden_act 106 | self.intermediate_size = intermediate_size 107 | self.hidden_dropout_prob = hidden_dropout_prob 108 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 109 | self.max_position_embeddings = max_position_embeddings 110 | self.type_vocab_size = type_vocab_size 111 | self.initializer_range = initializer_range 112 | self.layer_norm_eps = layer_norm_eps 113 | else: 114 | raise ValueError("First argument must be either a vocabulary size (int)" 115 | " or the path to a pretrained model config file (str)") 116 | -------------------------------------------------------------------------------- /optimization1.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team 
Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """PyTorch optimization for BERT model.""" 16 | 17 | import logging 18 | import math 19 | 20 | import torch 21 | from torch.optim import Optimizer 22 | from torch.optim.lr_scheduler import LambdaLR 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | class ConstantLRSchedule(LambdaLR): 27 | """ Constant learning rate schedule. 28 | """ 29 | def __init__(self, optimizer, last_epoch=-1): 30 | super(ConstantLRSchedule, self).__init__(optimizer, lambda _: 1.0, last_epoch=last_epoch) 31 | 32 | 33 | class WarmupConstantSchedule(LambdaLR): 34 | """ Linear warmup and then constant. 35 | Linearly increases learning rate schedule from 0 to 1 over `warmup_steps` training steps. 36 | Keeps learning rate schedule equal to 1. after warmup_steps. 37 | """ 38 | def __init__(self, optimizer, warmup_steps, last_epoch=-1): 39 | self.warmup_steps = warmup_steps 40 | super(WarmupConstantSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) 41 | 42 | def lr_lambda(self, step): 43 | if step < self.warmup_steps: 44 | return float(step) / float(max(1.0, self.warmup_steps)) 45 | return 1. 46 | 47 | 48 | class WarmupLinearSchedule(LambdaLR): 49 | """ Linear warmup and then linear decay. 50 | Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps. 51 | Linearly decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps. 52 | """ 53 | def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1): 54 | self.warmup_steps = warmup_steps 55 | self.t_total = t_total 56 | super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) 57 | 58 | def lr_lambda(self, step): 59 | if step < self.warmup_steps: 60 | return float(step) / float(max(1, self.warmup_steps)) 61 | return max(0.0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps))) 62 | 63 | 64 | class WarmupCosineSchedule(LambdaLR): 65 | """ Linear warmup and then cosine decay. 66 | Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps. 67 | Decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps following a cosine curve. 68 | If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup. 69 | """ 70 | def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1): 71 | self.warmup_steps = warmup_steps 72 | self.t_total = t_total 73 | self.cycles = cycles 74 | super(WarmupCosineSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) 75 | 76 | def lr_lambda(self, step): 77 | if step < self.warmup_steps: 78 | return float(step) / float(max(1.0, self.warmup_steps)) 79 | # progress after warmup 80 | progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps)) 81 | return max(0.0, 0.5 * (1. 
+ math.cos(math.pi * float(self.cycles) * 2.0 * progress))) 82 | 83 | 84 | class WarmupCosineWithHardRestartsSchedule(LambdaLR): 85 | """ Linear warmup and then cosine cycles with hard restarts. 86 | Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps. 87 | If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying 88 | learning rate (with hard restarts). 89 | """ 90 | def __init__(self, optimizer, warmup_steps, t_total, cycles=1., last_epoch=-1): 91 | self.warmup_steps = warmup_steps 92 | self.t_total = t_total 93 | self.cycles = cycles 94 | super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) 95 | 96 | def lr_lambda(self, step): 97 | if step < self.warmup_steps: 98 | return float(step) / float(max(1, self.warmup_steps)) 99 | # progress after warmup 100 | progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps)) 101 | if progress >= 1.0: 102 | return 0.0 103 | return max(0.0, 0.5 * (1. + math.cos(math.pi * ((float(self.cycles) * progress) % 1.0)))) 104 | 105 | 106 | 107 | class AdamW(Optimizer): 108 | """ Implements Adam algorithm with weight decay fix. 109 | 110 | Parameters: 111 | lr (float): learning rate. Default 1e-3. 112 | betas (tuple of 2 floats): Adams beta parameters (b1, b2). Default: (0.9, 0.999) 113 | eps (float): Adams epsilon. Default: 1e-6 114 | weight_decay (float): Weight decay. Default: 0.0 115 | correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True. 116 | """ 117 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True): 118 | if lr < 0.0: 119 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 120 | if not 0.0 <= betas[0] < 1.0: 121 | raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0])) 122 | if not 0.0 <= betas[1] < 1.0: 123 | raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1])) 124 | if not 0.0 <= eps: 125 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps)) 126 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, 127 | correct_bias=correct_bias) 128 | super(AdamW, self).__init__(params, defaults) 129 | 130 | def step(self, closure=None): 131 | """Performs a single optimization step. 132 | 133 | Arguments: 134 | closure (callable, optional): A closure that reevaluates the model 135 | and returns the loss. 
136 | """ 137 | loss = None 138 | if closure is not None: 139 | loss = closure() 140 | 141 | for group in self.param_groups: 142 | for p in group['params']: 143 | if p.grad is None: 144 | continue 145 | grad = p.grad.data 146 | if grad.is_sparse: 147 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 148 | 149 | state = self.state[p] 150 | 151 | # State initialization 152 | if len(state) == 0: 153 | state['step'] = 0 154 | # Exponential moving average of gradient values 155 | state['exp_avg'] = torch.zeros_like(p.data) 156 | # Exponential moving average of squared gradient values 157 | state['exp_avg_sq'] = torch.zeros_like(p.data) 158 | 159 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 160 | beta1, beta2 = group['betas'] 161 | 162 | state['step'] += 1 163 | 164 | # Decay the first and second moment running average coefficient 165 | # In-place operations to update the averages at the same time 166 | exp_avg.mul_(beta1).add_(1.0 - beta1, grad) 167 | exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad) 168 | denom = exp_avg_sq.sqrt().add_(group['eps']) 169 | 170 | step_size = group['lr'] 171 | if group['correct_bias']: # No bias correction for Bert 172 | bias_correction1 = 1.0 - beta1 ** state['step'] 173 | bias_correction2 = 1.0 - beta2 ** state['step'] 174 | step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 175 | 176 | p.data.addcdiv_(-step_size, exp_avg, denom) 177 | 178 | # Just adding the square of the weights to the loss function is *not* 179 | # the correct way of using L2 regularization/weight decay with Adam, 180 | # since that will interact with the m and v parameters in strange ways. 181 | # 182 | # Instead we want to decay the weights in a manner that doesn't interact 183 | # with the m/v parameters. This is equivalent to adding the square 184 | # of the weights to the loss with plain (non-momentum) SGD. 185 | # Add weight decay at the end (fixed version) 186 | if group['weight_decay'] > 0.0: 187 | p.data.add_(-group['lr'] * group['weight_decay'], p.data) 188 | 189 | return loss 190 | -------------------------------------------------------------------------------- /tokenization_gpt21.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for OpenAI GPT.""" 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import sys 20 | import json 21 | import logging 22 | import os 23 | import regex as re 24 | from io import open 25 | 26 | try: 27 | from functools import lru_cache 28 | except ImportError: 29 | # Just a dummy decorator to get the checks to run on python2 30 | # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 
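# The optimization1.py module above supplies the AdamW optimizer and the warm-up
# learning-rate schedules used during fine-tuning. A minimal, illustrative
# training-step set-up (model, loss and num_training_steps are placeholders from
# the surrounding training loop; the hyper-parameter values are not the
# repository's defaults):
#
#     from optimization1 import AdamW, WarmupLinearSchedule
#
#     optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.01)
#     scheduler = WarmupLinearSchedule(optimizer, warmup_steps=100, t_total=num_training_steps)
#
#     loss.backward()
#     optimizer.step()      # parameter update with decoupled (fixed) weight decay
#     scheduler.step()      # advance the linear warm-up / linear decay schedule
#     optimizer.zero_grad()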
31 | def lru_cache(): 32 | return lambda func: func 33 | 34 | from tokenization_utils1 import PreTrainedTokenizer 35 | 36 | logger = logging.getLogger(__name__) 37 | 38 | VOCAB_FILES_NAMES = { 39 | 'vocab_file': 'vocab.json', 40 | 'merges_file': 'merges.txt', 41 | } 42 | 43 | PRETRAINED_VOCAB_FILES_MAP = { 44 | 'vocab_file': 45 | { 46 | 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", 47 | 'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json", 48 | 'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json", 49 | 'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json", 50 | }, 51 | 'merges_file': 52 | { 53 | 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", 54 | 'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt", 55 | 'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt", 56 | 'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt", 57 | }, 58 | } 59 | 60 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 61 | 'gpt2': 1024, 62 | 'gpt2-medium': 1024, 63 | 'gpt2-large': 1024, 64 | 'distilgpt2': 1024, 65 | } 66 | 67 | 68 | @lru_cache() 69 | def bytes_to_unicode(): 70 | """ 71 | Returns list of utf-8 byte and a mapping to unicode strings. 72 | We specifically avoids mapping to whitespace/control characters the bpe code barfs on. 73 | 74 | The reversible bpe codes work on unicode strings. 75 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 76 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 77 | This is a signficant percentage of your normal, say, 32K bpe vocab. 78 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 79 | """ 80 | _chr = unichr if sys.version_info[0] == 2 else chr 81 | bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) 82 | cs = bs[:] 83 | n = 0 84 | for b in range(2 ** 8): 85 | if b not in bs: 86 | bs.append(b) 87 | cs.append(2 ** 8 + n) 88 | n += 1 89 | cs = [_chr(n) for n in cs] 90 | return dict(zip(bs, cs)) 91 | 92 | 93 | def get_pairs(word): 94 | """Return set of symbol pairs in a word. 95 | 96 | Word is represented as tuple of symbols (symbols being variable-length strings). 97 | """ 98 | pairs = set() 99 | prev_char = word[0] 100 | for char in word[1:]: 101 | pairs.add((prev_char, char)) 102 | prev_char = char 103 | return pairs 104 | 105 | 106 | class GPT2Tokenizer(PreTrainedTokenizer): 107 | """ 108 | GPT-2 BPE tokenizer. Peculiarities: 109 | - Byte-level Byte-Pair-Encoding 110 | - Requires a space to start the input string => the encoding methods should be called with the 111 | ``add_prefix_space`` flag set to ``True``. 
112 | Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve 113 | the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"` 114 | """ 115 | vocab_files_names = VOCAB_FILES_NAMES 116 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 117 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 118 | 119 | def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>", 120 | bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs): 121 | super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs) 122 | self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens 123 | self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens 124 | 125 | self.encoder = json.load(open(vocab_file, encoding="utf-8")) 126 | self.decoder = {v: k for k, v in self.encoder.items()} 127 | self.errors = errors # how to handle errors in decoding 128 | self.byte_encoder = bytes_to_unicode() 129 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 130 | bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] 131 | bpe_merges = [tuple(merge.split()) for merge in bpe_data] 132 | self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) 133 | self.cache = {} 134 | 135 | # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions 136 | self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") 137 | 138 | @property 139 | def vocab_size(self): 140 | return len(self.encoder) 141 | 142 | def bpe(self, token): 143 | if token in self.cache: 144 | return self.cache[token] 145 | word = tuple(token) 146 | pairs = get_pairs(word) 147 | 148 | if not pairs: 149 | return token 150 | 151 | while True: 152 | bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) 153 | if bigram not in self.bpe_ranks: 154 | break 155 | first, second = bigram 156 | new_word = [] 157 | i = 0 158 | while i < len(word): 159 | try: 160 | j = word.index(first, i) 161 | new_word.extend(word[i:j]) 162 | i = j 163 | except: 164 | new_word.extend(word[i:]) 165 | break 166 | 167 | if word[i] == first and i < len(word) - 1 and word[i + 1] == second: 168 | new_word.append(first + second) 169 | i += 2 170 | else: 171 | new_word.append(word[i]) 172 | i += 1 173 | new_word = tuple(new_word) 174 | word = new_word 175 | if len(word) == 1: 176 | break 177 | else: 178 | pairs = get_pairs(word) 179 | word = ' '.join(word) 180 | self.cache[token] = word 181 | return word 182 | 183 | def _tokenize(self, text, add_prefix_space=False): 184 | """ Tokenize a string. 185 | Args: 186 | - add_prefix_space (boolean, default False): 187 | Begin the sentence with at least one space toto get invariance to word order in GPT-2 (and RoBERTa) tokenizers. 
188 | """ 189 | if add_prefix_space: 190 | text = ' ' + text 191 | 192 | bpe_tokens = [] 193 | for token in re.findall(self.pat, text): 194 | if sys.version_info[0] == 2: 195 | token = ''.join(self.byte_encoder[ord(b)] for b in 196 | token) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) 197 | else: 198 | token = ''.join(self.byte_encoder[b] for b in token.encode( 199 | 'utf-8')) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) 200 | bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) 201 | return bpe_tokens 202 | 203 | def _convert_token_to_id(self, token): 204 | """ Converts a token (str/unicode) in an id using the vocab. """ 205 | return self.encoder.get(token, self.encoder.get(self.unk_token)) 206 | 207 | def _convert_id_to_token(self, index): 208 | """Converts an index (integer) in a token (string/unicode) using the vocab.""" 209 | return self.decoder.get(index) 210 | 211 | def convert_tokens_to_string(self, tokens): 212 | """ Converts a sequence of tokens (string) in a single string. """ 213 | text = ''.join(tokens) 214 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) 215 | return text 216 | 217 | def save_vocabulary(self, save_directory): 218 | """Save the tokenizer vocabulary and merge files to a directory.""" 219 | if not os.path.isdir(save_directory): 220 | logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) 221 | return 222 | vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) 223 | merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file']) 224 | 225 | with open(vocab_file, 'w', encoding='utf-8') as f: 226 | f.write(json.dumps(self.encoder, ensure_ascii=False)) 227 | 228 | index = 0 229 | with open(merge_file, "w", encoding="utf-8") as writer: 230 | writer.write(u'#version: 0.2\n') 231 | for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): 232 | if index != token_index: 233 | logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." 234 | " Please check that the tokenizer is not corrupted!".format(merge_file)) 235 | index = token_index 236 | writer.write(' '.join(bpe_tokens) + u'\n') 237 | index += 1 238 | 239 | return vocab_file, merge_file -------------------------------------------------------------------------------- /configuration_utils1.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
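# A short usage sketch for the byte-level BPE tokenizer defined in
# tokenization_gpt21.py above (RobertaTokenizer reuses the same machinery).
# The vocab.json / merges.txt paths are placeholders for files downloaded from
# the PRETRAINED_VOCAB_FILES_MAP URLs.
def _example_byte_level_bpe():  # illustrative only, never called by the code
    from tokenization_gpt21 import GPT2Tokenizer

    tokenizer = GPT2Tokenizer("vocab.json", "merges.txt")
    # add_prefix_space=True, as recommended in the class docstring, so a leading
    # word is encoded the same way as it would be in the middle of a sentence.
    tokens = tokenizer.tokenize("Hello world", add_prefix_space=True)
    ids = tokenizer.convert_tokens_to_ids(tokens)
    text = tokenizer.convert_tokens_to_string(tokens)  # gives back " Hello world"
    return tokens, ids, text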
16 | """ Configuration base class and utilities.""" 17 | 18 | from __future__ import (absolute_import, division, print_function, 19 | unicode_literals) 20 | 21 | import copy 22 | import json 23 | import logging 24 | import os 25 | from io import open 26 | 27 | from file_utils1 import cached_path, CONFIG_NAME 28 | 29 | logger = logging.getLogger(__name__) 30 | 31 | class PretrainedConfig(object): 32 | r""" Base class for all configuration classes. 33 | Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations. 34 | 35 | Note: 36 | A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights. 37 | It only affects the model's configuration. 38 | 39 | Class attributes (overridden by derived classes): 40 | - ``pretrained_config_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained model configurations as values. 41 | 42 | Parameters: 43 | ``finetuning_task``: string, default `None`. Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint. 44 | ``num_labels``: integer, default `2`. Number of classes to use when the model is a classification model (sequences/tokens) 45 | ``output_attentions``: boolean, default `False`. Should the model returns attentions weights. 46 | ``output_hidden_states``: string, default `False`. Should the model returns all hidden-states. 47 | ``torchscript``: string, default `False`. Is the model used with Torchscript. 48 | """ 49 | pretrained_config_archive_map = {} 50 | 51 | def __init__(self, **kwargs): 52 | self.finetuning_task = kwargs.pop('finetuning_task', None) 53 | self.num_labels = kwargs.pop('num_labels', 2) 54 | self.output_attentions = kwargs.pop('output_attentions', False) 55 | self.output_hidden_states = kwargs.pop('output_hidden_states', False) 56 | self.output_past = kwargs.pop('output_past', True) # Not used by all models 57 | self.torchscript = kwargs.pop('torchscript', False) # Only used by PyTorch models 58 | self.use_bfloat16 = kwargs.pop('use_bfloat16', False) 59 | self.pruned_heads = kwargs.pop('pruned_heads', {}) 60 | 61 | def save_pretrained(self, save_directory): 62 | """ Save a configuration object to the directory `save_directory`, so that it 63 | can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method. 64 | """ 65 | assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved" 66 | 67 | # If we save using the predefined names, we can load using `from_pretrained` 68 | output_config_file = os.path.join(save_directory, CONFIG_NAME) 69 | 70 | self.to_json_file(output_config_file) 71 | logger.info("Configuration saved in {}".format(output_config_file)) 72 | 73 | @classmethod 74 | def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): 75 | r""" Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration. 76 | 77 | Parameters: 78 | pretrained_model_name_or_path: either: 79 | 80 | - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. 81 | - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. 
82 | - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. 83 | 84 | cache_dir: (`optional`) string: 85 | Path to a directory in which a downloaded pre-trained model 86 | configuration should be cached if the standard cache should not be used. 87 | 88 | kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading. 89 | 90 | - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. 91 | - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. 92 | 93 | force_download: (`optional`) boolean, default False: 94 | Force to (re-)download the model weights and configuration files and override the cached versions if they exists. 95 | 96 | proxies: (`optional`) dict, default None: 97 | A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. 98 | The proxies are used on each request. 99 | 100 | return_unused_kwargs: (`optional`) bool: 101 | 102 | - If False, then this function returns just the final configuration object. 103 | - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. 104 | 105 | Examples:: 106 | 107 | # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a 108 | # derived class: BertConfig 109 | config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. 110 | config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` 111 | config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json') 112 | config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) 113 | assert config.output_attention == True 114 | config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, 115 | foo=False, return_unused_kwargs=True) 116 | assert config.output_attention == True 117 | assert unused_kwargs == {'foo': False} 118 | 119 | """ 120 | cache_dir = kwargs.pop('cache_dir', None) 121 | force_download = kwargs.pop('force_download', False) 122 | proxies = kwargs.pop('proxies', None) 123 | return_unused_kwargs = kwargs.pop('return_unused_kwargs', False) 124 | 125 | if pretrained_model_name_or_path in cls.pretrained_config_archive_map: 126 | config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path] 127 | elif os.path.isdir(pretrained_model_name_or_path): 128 | config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) 129 | else: 130 | config_file = pretrained_model_name_or_path 131 | # redirect to the cache, if necessary 132 | try: 133 | resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies) 134 | except EnvironmentError: 135 | if pretrained_model_name_or_path in cls.pretrained_config_archive_map: 136 | msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format( 137 | config_file) 138 | else: 139 | msg = "Model name '{}' was not found in model name list ({}). 
" \ 140 | "We assumed '{}' was a path or url to a configuration file named {} or " \ 141 | "a directory containing such a file but couldn't find any such file at this path or url.".format( 142 | pretrained_model_name_or_path, 143 | ', '.join(cls.pretrained_config_archive_map.keys()), 144 | config_file, CONFIG_NAME) 145 | raise EnvironmentError(msg) 146 | 147 | if resolved_config_file == config_file: 148 | logger.info("loading configuration file {}".format(config_file)) 149 | else: 150 | logger.info("loading configuration file {} from cache at {}".format( 151 | config_file, resolved_config_file)) 152 | 153 | # Load config 154 | config = cls.from_json_file(resolved_config_file) 155 | 156 | if hasattr(config, 'pruned_heads'): 157 | config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items()) 158 | 159 | # Update config with kwargs if needed 160 | to_remove = [] 161 | for key, value in kwargs.items(): 162 | if hasattr(config, key): 163 | setattr(config, key, value) 164 | to_remove.append(key) 165 | for key in to_remove: 166 | kwargs.pop(key, None) 167 | 168 | logger.info("Model config %s", str(config)) 169 | if return_unused_kwargs: 170 | return config, kwargs 171 | else: 172 | return config 173 | 174 | @classmethod 175 | def from_dict(cls, json_object): 176 | """Constructs a `Config` from a Python dictionary of parameters.""" 177 | config = cls(vocab_size_or_config_json_file=-1) 178 | for key, value in json_object.items(): 179 | setattr(config, key, value) 180 | return config 181 | 182 | @classmethod 183 | def from_json_file(cls, json_file): 184 | """Constructs a `BertConfig` from a json file of parameters.""" 185 | with open(json_file, "r", encoding='utf-8') as reader: 186 | text = reader.read() 187 | return cls.from_dict(json.loads(text)) 188 | 189 | def __eq__(self, other): 190 | return self.__dict__ == other.__dict__ 191 | 192 | def __repr__(self): 193 | return str(self.to_json_string()) 194 | 195 | def to_dict(self): 196 | """Serializes this instance to a Python dictionary.""" 197 | output = copy.deepcopy(self.__dict__) 198 | return output 199 | 200 | def to_json_string(self): 201 | """Serializes this instance to a JSON string.""" 202 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 203 | 204 | def to_json_file(self, json_file_path): 205 | """ Save this instance to a json file.""" 206 | with open(json_file_path, "w", encoding='utf-8') as writer: 207 | writer.write(self.to_json_string()) 208 | -------------------------------------------------------------------------------- /file_utils1.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for working with the local dataset cache. 3 | This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp 4 | Copyright by the AllenNLP authors. 
5 | """ 6 | from __future__ import (absolute_import, division, print_function, unicode_literals) 7 | 8 | import sys 9 | import json 10 | import logging 11 | import os 12 | import six 13 | import shutil 14 | import tempfile 15 | import fnmatch 16 | from functools import wraps 17 | from hashlib import sha256 18 | from io import open 19 | 20 | import boto3 21 | from botocore.config import Config 22 | from botocore.exceptions import ClientError 23 | import requests 24 | from tqdm import tqdm 25 | 26 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 27 | 28 | try: 29 | import tensorflow as tf 30 | assert hasattr(tf, '__version__') and int(tf.__version__[0]) >= 2 31 | _tf_available = True # pylint: disable=invalid-name 32 | logger.info("TensorFlow version {} available.".format(tf.__version__)) 33 | except (ImportError, AssertionError): 34 | _tf_available = False # pylint: disable=invalid-name 35 | 36 | try: 37 | import torch 38 | _torch_available = True # pylint: disable=invalid-name 39 | logger.info("PyTorch version {} available.".format(torch.__version__)) 40 | except ImportError: 41 | _torch_available = False # pylint: disable=invalid-name 42 | 43 | 44 | try: 45 | from torch.hub import _get_torch_home 46 | torch_cache_home = _get_torch_home() 47 | except ImportError: 48 | torch_cache_home = os.path.expanduser( 49 | os.getenv('TORCH_HOME', os.path.join( 50 | os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch'))) 51 | default_cache_path = os.path.join(torch_cache_home, 'transformers') 52 | 53 | try: 54 | from urllib.parse import urlparse 55 | except ImportError: 56 | from urlparse import urlparse 57 | 58 | try: 59 | from pathlib import Path 60 | PYTORCH_PRETRAINED_BERT_CACHE = Path( 61 | os.getenv('PYTORCH_TRANSFORMERS_CACHE', os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path))) 62 | except (AttributeError, ImportError): 63 | PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_TRANSFORMERS_CACHE', 64 | os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 65 | default_cache_path)) 66 | 67 | PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility 68 | TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility 69 | 70 | WEIGHTS_NAME = "pytorch_model.bin" 71 | TF2_WEIGHTS_NAME = 'tf_model.h5' 72 | TF_WEIGHTS_NAME = 'model.ckpt' 73 | CONFIG_NAME = "config.json" 74 | 75 | def is_torch_available(): 76 | return _torch_available 77 | 78 | def is_tf_available(): 79 | return _tf_available 80 | 81 | if not six.PY2: 82 | def add_start_docstrings(*docstr): 83 | def docstring_decorator(fn): 84 | fn.__doc__ = ''.join(docstr) + fn.__doc__ 85 | return fn 86 | return docstring_decorator 87 | 88 | def add_end_docstrings(*docstr): 89 | def docstring_decorator(fn): 90 | fn.__doc__ = fn.__doc__ + ''.join(docstr) 91 | return fn 92 | return docstring_decorator 93 | else: 94 | # Not possible to update class docstrings on python2 95 | def add_start_docstrings(*docstr): 96 | def docstring_decorator(fn): 97 | return fn 98 | return docstring_decorator 99 | 100 | def add_end_docstrings(*docstr): 101 | def docstring_decorator(fn): 102 | return fn 103 | return docstring_decorator 104 | 105 | def url_to_filename(url, etag=None): 106 | """ 107 | Convert `url` into a hashed filename in a repeatable way. 108 | If `etag` is specified, append its hash to the url's, delimited 109 | by a period. 
110 | If the url ends with .h5 (Keras HDF5 weights) ands '.h5' to the name 111 | so that TF 2.0 can identify it as a HDF5 file 112 | (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380) 113 | """ 114 | url_bytes = url.encode('utf-8') 115 | url_hash = sha256(url_bytes) 116 | filename = url_hash.hexdigest() 117 | 118 | if etag: 119 | etag_bytes = etag.encode('utf-8') 120 | etag_hash = sha256(etag_bytes) 121 | filename += '.' + etag_hash.hexdigest() 122 | 123 | if url.endswith('.h5'): 124 | filename += '.h5' 125 | 126 | return filename 127 | 128 | 129 | def filename_to_url(filename, cache_dir=None): 130 | """ 131 | Return the url and etag (which may be ``None``) stored for `filename`. 132 | Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. 133 | """ 134 | if cache_dir is None: 135 | cache_dir = TRANSFORMERS_CACHE 136 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 137 | cache_dir = str(cache_dir) 138 | 139 | cache_path = os.path.join(cache_dir, filename) 140 | if not os.path.exists(cache_path): 141 | raise EnvironmentError("file {} not found".format(cache_path)) 142 | 143 | meta_path = cache_path + '.json' 144 | if not os.path.exists(meta_path): 145 | raise EnvironmentError("file {} not found".format(meta_path)) 146 | 147 | with open(meta_path, encoding="utf-8") as meta_file: 148 | metadata = json.load(meta_file) 149 | url = metadata['url'] 150 | etag = metadata['etag'] 151 | 152 | return url, etag 153 | 154 | 155 | def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None): 156 | """ 157 | Given something that might be a URL (or might be a local path), 158 | determine which. If it's a URL, download the file and cache it, and 159 | return the path to the cached file. If it's already a local path, 160 | make sure the file exists and then return the path. 161 | Args: 162 | cache_dir: specify a cache directory to save the file to (overwrite the default cache dir). 163 | force_download: if True, re-dowload the file even if it's already cached in the cache dir. 164 | """ 165 | if cache_dir is None: 166 | cache_dir = TRANSFORMERS_CACHE 167 | if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): 168 | url_or_filename = str(url_or_filename) 169 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 170 | cache_dir = str(cache_dir) 171 | 172 | parsed = urlparse(url_or_filename) 173 | 174 | if parsed.scheme in ('http', 'https', 's3'): 175 | # URL, so get it from the cache (downloading if necessary) 176 | return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies) 177 | elif os.path.exists(url_or_filename): 178 | # File, and it exists. 179 | return url_or_filename 180 | elif parsed.scheme == '': 181 | # File, but it doesn't exist. 182 | raise EnvironmentError("file {} not found".format(url_or_filename)) 183 | else: 184 | # Something unknown 185 | raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) 186 | 187 | 188 | def split_s3_path(url): 189 | """Split a full s3 path into the bucket name and path.""" 190 | parsed = urlparse(url) 191 | if not parsed.netloc or not parsed.path: 192 | raise ValueError("bad s3 path {}".format(url)) 193 | bucket_name = parsed.netloc 194 | s3_path = parsed.path 195 | # Remove '/' at beginning of path. 
196 | if s3_path.startswith("/"): 197 | s3_path = s3_path[1:] 198 | return bucket_name, s3_path 199 | 200 | 201 | def s3_request(func): 202 | """ 203 | Wrapper function for s3 requests in order to create more helpful error 204 | messages. 205 | """ 206 | 207 | @wraps(func) 208 | def wrapper(url, *args, **kwargs): 209 | try: 210 | return func(url, *args, **kwargs) 211 | except ClientError as exc: 212 | if int(exc.response["Error"]["Code"]) == 404: 213 | raise EnvironmentError("file {} not found".format(url)) 214 | else: 215 | raise 216 | 217 | return wrapper 218 | 219 | 220 | @s3_request 221 | def s3_etag(url, proxies=None): 222 | """Check ETag on S3 object.""" 223 | s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) 224 | bucket_name, s3_path = split_s3_path(url) 225 | s3_object = s3_resource.Object(bucket_name, s3_path) 226 | return s3_object.e_tag 227 | 228 | 229 | @s3_request 230 | def s3_get(url, temp_file, proxies=None): 231 | """Pull a file directly from S3.""" 232 | s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) 233 | bucket_name, s3_path = split_s3_path(url) 234 | s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) 235 | 236 | 237 | def http_get(url, temp_file, proxies=None): 238 | req = requests.get(url, stream=True, proxies=proxies) 239 | content_length = req.headers.get('Content-Length') 240 | total = int(content_length) if content_length is not None else None 241 | progress = tqdm(unit="B", total=total) 242 | for chunk in req.iter_content(chunk_size=1024): 243 | if chunk: # filter out keep-alive new chunks 244 | progress.update(len(chunk)) 245 | temp_file.write(chunk) 246 | progress.close() 247 | 248 | 249 | def get_from_cache(url, cache_dir=None, force_download=False, proxies=None): 250 | """ 251 | Given a URL, look for the corresponding dataset in the local cache. 252 | If it's not there, download it. Then return the path to the cached file. 253 | """ 254 | if cache_dir is None: 255 | cache_dir = TRANSFORMERS_CACHE 256 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 257 | cache_dir = str(cache_dir) 258 | if sys.version_info[0] == 2 and not isinstance(cache_dir, str): 259 | cache_dir = str(cache_dir) 260 | 261 | if not os.path.exists(cache_dir): 262 | os.makedirs(cache_dir) 263 | 264 | # Get eTag to add to filename, if it exists. 265 | if url.startswith("s3://"): 266 | etag = s3_etag(url, proxies=proxies) 267 | else: 268 | try: 269 | response = requests.head(url, allow_redirects=True, proxies=proxies) 270 | if response.status_code != 200: 271 | etag = None 272 | else: 273 | etag = response.headers.get("ETag") 274 | except EnvironmentError: 275 | etag = None 276 | 277 | if sys.version_info[0] == 2 and etag is not None: 278 | etag = etag.decode('utf-8') 279 | filename = url_to_filename(url, etag) 280 | 281 | # get cache path to put the file 282 | cache_path = os.path.join(cache_dir, filename) 283 | 284 | # If we don't have a connection (etag is None) and can't identify the file 285 | # try to get the last downloaded one 286 | if not os.path.exists(cache_path) and etag is None: 287 | matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*') 288 | matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files)) 289 | if matching_files: 290 | cache_path = os.path.join(cache_dir, matching_files[-1]) 291 | 292 | if not os.path.exists(cache_path) or force_download: 293 | # Download to temporary file, then copy to cache dir once finished. 
294 | # Otherwise you get corrupt cache entries if the download gets interrupted. 295 | with tempfile.NamedTemporaryFile() as temp_file: 296 | logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name) 297 | 298 | # GET file object 299 | if url.startswith("s3://"): 300 | s3_get(url, temp_file, proxies=proxies) 301 | else: 302 | http_get(url, temp_file, proxies=proxies) 303 | 304 | # we are copying the file before closing it, so flush to avoid truncation 305 | temp_file.flush() 306 | # shutil.copyfileobj() starts at the current position, so go to the start 307 | temp_file.seek(0) 308 | 309 | logger.info("copying %s to cache at %s", temp_file.name, cache_path) 310 | with open(cache_path, 'wb') as cache_file: 311 | shutil.copyfileobj(temp_file, cache_file) 312 | 313 | logger.info("creating metadata file for %s", cache_path) 314 | meta = {'url': url, 'etag': etag} 315 | meta_path = cache_path + '.json' 316 | with open(meta_path, 'w') as meta_file: 317 | output_string = json.dumps(meta) 318 | if sys.version_info[0] == 2 and isinstance(output_string, str): 319 | output_string = unicode(output_string, 'utf-8') # The beauty of python 2 320 | meta_file.write(output_string) 321 | 322 | logger.info("removing temp file %s", temp_file.name) 323 | 324 | return cache_path 325 | -------------------------------------------------------------------------------- /glue1.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ GLUE processors and helpers """ 17 | 18 | import logging 19 | import os 20 | import numpy as np 21 | from utils import DataProcessor, InputExample, InputFeatures 22 | from file_utils1 import is_tf_available 23 | 24 | if is_tf_available(): 25 | import tensorflow as tf 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | def glue_convert_examples_to_features(examples, tokenizer, 31 | max_length=512, 32 | task=None, 33 | label_list=None, 34 | output_mode=None, 35 | pad_on_left=False, 36 | pad_token=0, 37 | pad_token_segment_id=0, 38 | mask_padding_with_zero=True): 39 | """ 40 | Loads a data file into a list of ``InputFeatures`` 41 | 42 | Args: 43 | examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples. 44 | tokenizer: Instance of a tokenizer that will tokenize the examples 45 | max_length: Maximum example length 46 | task: GLUE task 47 | label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method 48 | output_mode: String indicating the output mode. 
Either ``regression`` or ``classification`` 49 | pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default) 50 | pad_token: Padding token 51 | pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4) 52 | mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values 53 | and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for 54 | actual values) 55 | 56 | Returns: 57 | If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` 58 | containing the task-specific features. If the input is a list of ``InputExamples``, will return 59 | a list of task-specific ``InputFeatures`` which can be fed to the model. 60 | 61 | """ 62 | is_tf_dataset = False 63 | if is_tf_available() and isinstance(examples, tf.data.Dataset): 64 | is_tf_dataset = True 65 | 66 | if task is not None: 67 | processor = glue_processors[task]() 68 | if label_list is None: 69 | label_list = processor.get_labels() 70 | logger.info("Using label list %s for task %s" % (label_list, task)) 71 | if output_mode is None: 72 | output_mode = glue_output_modes[task] 73 | logger.info("Using output mode %s for task %s" % (output_mode, task)) 74 | 75 | label_map = {label: i for i, label in enumerate(label_list)} 76 | 77 | features = [] 78 | for (ex_index, example) in enumerate(examples): 79 | if ex_index % 10000 == 0: 80 | logger.info("Writing example %d" % (ex_index)) 81 | if is_tf_dataset: 82 | example = processor.get_example_from_tensor_dict(example) 83 | 84 | inputs = tokenizer.encode_plus( 85 | example.text_a, 86 | example.text_b, 87 | add_special_tokens=True, 88 | max_length=max_length, 89 | ) 90 | input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] 91 | 92 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 93 | # tokens are attended to. 94 | attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) 95 | 96 | # Zero-pad up to the sequence length. 
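        # Illustrative numbers: with max_length=8 and 5 encoded ids, padding_length is 3, so three
        # pad_token ids are appended on the right (the default) and, with mask_padding_with_zero=True,
        # the attention mask becomes [1, 1, 1, 1, 1, 0, 0, 0].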
97 | padding_length = max_length - len(input_ids) 98 | if pad_on_left: 99 | input_ids = ([pad_token] * padding_length) + input_ids 100 | attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask 101 | token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids 102 | else: 103 | input_ids = input_ids + ([pad_token] * padding_length) 104 | attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) 105 | token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) 106 | 107 | assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length) 108 | assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length) 109 | assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length) 110 | 111 | if output_mode == "classification": 112 | label = label_map[example.label] 113 | elif output_mode == "regression": 114 | label = example.label 115 | elif output_mode == "MultiLabelclassification": 116 | label = example.label 117 | else: 118 | raise KeyError(output_mode) 119 | 120 | if ex_index < 5: 121 | logger.info("*** Example ***") 122 | logger.info("guid: %s" % (example.guid)) 123 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 124 | logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) 125 | logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids])) 126 | # logger.info("label: %s (id = %d)" % (example.label, label)) 127 | 128 | features.append( 129 | InputFeatures(input_ids=input_ids, 130 | attention_mask=attention_mask, 131 | token_type_ids=token_type_ids, 132 | label=label)) 133 | 134 | if is_tf_available() and is_tf_dataset: 135 | def gen(): 136 | for ex in features: 137 | yield ({'input_ids': ex.input_ids, 138 | 'attention_mask': ex.attention_mask, 139 | 'token_type_ids': ex.token_type_ids}, 140 | ex.label) 141 | 142 | return tf.data.Dataset.from_generator(gen, 143 | ({'input_ids': tf.int32, 144 | 'attention_mask': tf.int32, 145 | 'token_type_ids': tf.int32}, 146 | tf.int64), 147 | ({'input_ids': tf.TensorShape([None]), 148 | 'attention_mask': tf.TensorShape([None]), 149 | 'token_type_ids': tf.TensorShape([None])}, 150 | tf.TensorShape([]))) 151 | 152 | return features 153 | 154 | 155 | class MrpcProcessor(DataProcessor): 156 | """Processor for the MRPC data set (GLUE version).""" 157 | 158 | def get_example_from_tensor_dict(self, tensor_dict): 159 | """See base class.""" 160 | return InputExample(tensor_dict['idx'].numpy(), 161 | tensor_dict['sentence1'].numpy().decode('utf-8'), 162 | tensor_dict['sentence2'].numpy().decode('utf-8'), 163 | str(tensor_dict['label'].numpy())) 164 | 165 | def get_train_examples(self, data_dir): 166 | """See base class.""" 167 | logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv"))) 168 | return self._create_examples( 169 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") 170 | 171 | def get_dev_examples(self, data_dir): 172 | """See base class.""" 173 | return self._create_examples( 174 | self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") 175 | 176 | def get_labels(self): 177 | """See base class.""" 178 | return ["0", "1"] 179 | 180 | def _create_examples(self, lines, set_type): 181 | """Creates examples for the training and dev sets.""" 182 | examples = [] 183 | for (i, line) in enumerate(lines): 184 | if i == 0: 185 | 
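                # The first row of the tsv file is a column header rather than an example, so skip it.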
continue 186 | guid = "%s-%s" % (set_type, i) 187 | text_a = line[3] 188 | text_b = line[4] 189 | label = line[0] 190 | examples.append( 191 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 192 | return examples 193 | 194 | class StsbProcessor(DataProcessor): 195 | """Processor for the STS-B data set (GLUE version).""" 196 | 197 | def get_example_from_tensor_dict(self, tensor_dict): 198 | """See base class.""" 199 | return InputExample(tensor_dict['idx'].numpy(), 200 | tensor_dict['sentence1'].numpy().decode('utf-8'), 201 | tensor_dict['sentence2'].numpy().decode('utf-8'), 202 | str(tensor_dict['label'].numpy())) 203 | 204 | def get_train_examples(self, data_dir): 205 | """See base class.""" 206 | return self._create_examples( 207 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") 208 | 209 | def get_dev_examples(self, data_dir): 210 | """See base class.""" 211 | return self._create_examples( 212 | self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") 213 | 214 | def get_labels(self): 215 | """See base class.""" 216 | return [None] 217 | 218 | def _create_examples(self, lines, set_type): 219 | """Creates examples for the training and dev sets.""" 220 | examples = [] 221 | for (i, line) in enumerate(lines): 222 | if i == 0: 223 | continue 224 | guid = "%s-%s" % (set_type, line[0]) 225 | text_a = line[7] 226 | text_b = line[8] 227 | label = line[-1] 228 | examples.append( 229 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 230 | return examples 231 | 232 | class MultiLabelProcessor(DataProcessor): 233 | """Processor for the MultiNLI data set (GLUE version).""" 234 | 235 | def get_example_from_tensor_dict(self, tensor_dict): 236 | """See base class.""" 237 | return InputExample(tensor_dict['idx'].numpy(), 238 | tensor_dict['premise'].numpy().decode('utf-8'), 239 | tensor_dict['hypothesis'].numpy().decode('utf-8'), 240 | str(tensor_dict['label'].numpy())) 241 | 242 | def get_train_examples(self, data_dir): 243 | """See base class.""" 244 | return self._create_examples( 245 | self._read_tsv(os.path.join(data_dir, "train_data.tsv")), "train") 246 | 247 | def get_dev_examples(self, data_dir): 248 | """See base class.""" 249 | return self._create_examples( 250 | self._read_tsv(os.path.join(data_dir, "eval_data.tsv")), 251 | "dev_matched") 252 | 253 | def get_labels(self): 254 | """See base class.""" 255 | return ["0", "1", "2","3","4","5","6", "7", "8","9","10","11","12", "13", "14","15"] 256 | 257 | def _create_examples(self, lines, set_type): 258 | """Creates examples for the training and dev sets.""" 259 | examples = [] 260 | for (i, line) in enumerate(lines): 261 | guid = "%s" % (set_type) 262 | text_a = line[0] 263 | label=np.zeros((16,), dtype=int) 264 | label_sum=["inform_theater","inform_starttime","inform_numberofpeople","greeting","thanks","inform_other","request_moviename","inform_genre","request_ticket", 265 | "inform_city","inform_state","inform_date","inform_moviename","confirm_answer","inform_zip","inform_video_format"] 266 | for i in range(16): 267 | if label_sum[i] in line: 268 | label[i]=1 269 | examples.append( 270 | InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) 271 | 272 | return examples 273 | 274 | glue_tasks_num_labels = { 275 | "mrpc": 2, 276 | "sts-b": 1, 277 | "multilabel":16, 278 | } 279 | 280 | glue_processors = { 281 | "mrpc": MrpcProcessor, 282 | "sts-b": StsbProcessor, 283 | "multilabel": MultiLabelProcessor, 284 | } 285 | 286 | glue_output_modes = { 287 | "mrpc": "classification", 288 | 
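    # The output mode controls how glue_convert_examples_to_features handles labels:
    # "classification" maps a label string to its index, "regression" keeps a float value, and the
    # custom "MultiLabelclassification" mode passes the label through unchanged. For the
    # "multilabel" task that label is the 16-dim 0/1 vector built by
    # MultiLabelProcessor._create_examples above; e.g. (illustrative) an utterance tagged
    # greeting and thanks becomes [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0].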
"sts-b": "regression", 289 | "multilabel": "MultiLabelclassification", 290 | } 291 | -------------------------------------------------------------------------------- /tokenization_bert1.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import collections 20 | import logging 21 | import os 22 | import unicodedata 23 | from io import open 24 | 25 | from tokenization_utils1 import PreTrainedTokenizer 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} 30 | 31 | PRETRAINED_VOCAB_FILES_MAP = { 32 | 'vocab_file': 33 | { 34 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 35 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 36 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", 37 | 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", 38 | 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", 39 | 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", 40 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", 41 | 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", 42 | 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt", 43 | 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt", 44 | 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt", 45 | 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt", 46 | 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", 47 | 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt", 48 | 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt", 49 | } 50 | } 51 | 52 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 53 | 'bert-base-uncased': 512, 54 | 
'bert-large-uncased': 512, 55 | 'bert-base-cased': 512, 56 | 'bert-large-cased': 512, 57 | 'bert-base-multilingual-uncased': 512, 58 | 'bert-base-multilingual-cased': 512, 59 | 'bert-base-chinese': 512, 60 | 'bert-base-german-cased': 512, 61 | 'bert-large-uncased-whole-word-masking': 512, 62 | 'bert-large-cased-whole-word-masking': 512, 63 | 'bert-large-uncased-whole-word-masking-finetuned-squad': 512, 64 | 'bert-large-cased-whole-word-masking-finetuned-squad': 512, 65 | 'bert-base-cased-finetuned-mrpc': 512, 66 | 'bert-base-german-dbmdz-cased': 512, 67 | 'bert-base-german-dbmdz-uncased': 512, 68 | } 69 | 70 | PRETRAINED_INIT_CONFIGURATION = { 71 | 'bert-base-uncased': {'do_lower_case': True}, 72 | 'bert-large-uncased': {'do_lower_case': True}, 73 | 'bert-base-cased': {'do_lower_case': False}, 74 | 'bert-large-cased': {'do_lower_case': False}, 75 | 'bert-base-multilingual-uncased': {'do_lower_case': True}, 76 | 'bert-base-multilingual-cased': {'do_lower_case': False}, 77 | 'bert-base-chinese': {'do_lower_case': False}, 78 | 'bert-base-german-cased': {'do_lower_case': False}, 79 | 'bert-large-uncased-whole-word-masking': {'do_lower_case': True}, 80 | 'bert-large-cased-whole-word-masking': {'do_lower_case': False}, 81 | 'bert-large-uncased-whole-word-masking-finetuned-squad': {'do_lower_case': True}, 82 | 'bert-large-cased-whole-word-masking-finetuned-squad': {'do_lower_case': False}, 83 | 'bert-base-cased-finetuned-mrpc': {'do_lower_case': False}, 84 | 'bert-base-german-dbmdz-cased': {'do_lower_case': False}, 85 | 'bert-base-german-dbmdz-uncased': {'do_lower_case': True}, 86 | } 87 | 88 | 89 | def load_vocab(vocab_file): 90 | """Loads a vocabulary file into a dictionary.""" 91 | vocab = collections.OrderedDict() 92 | with open(vocab_file, "r", encoding="utf-8") as reader: 93 | tokens = reader.readlines() 94 | for index, token in enumerate(tokens): 95 | token = token.rstrip('\n') 96 | vocab[token] = index 97 | return vocab 98 | 99 | 100 | def whitespace_tokenize(text): 101 | """Runs basic whitespace cleaning and splitting on a piece of text.""" 102 | text = text.strip() 103 | if not text: 104 | return [] 105 | tokens = text.split() 106 | return tokens 107 | 108 | 109 | class BertTokenizer(PreTrainedTokenizer): 110 | r""" 111 | Constructs a BertTokenizer. 112 | :class:`~transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece 113 | 114 | Args: 115 | vocab_file: Path to a one-wordpiece-per-line vocabulary file 116 | do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False 117 | do_basic_tokenize: Whether to do basic tokenization before wordpiece. 118 | max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the 119 | minimum of this value (if specified) and the underlying BERT model's sequence length. 120 | never_split: List of tokens which will never be split during tokenization. 
Only has an effect when 121 | do_wordpiece_only=False 122 | """ 123 | 124 | vocab_files_names = VOCAB_FILES_NAMES 125 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 126 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 127 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 128 | 129 | def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None, 130 | unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]", 131 | mask_token="[MASK]", tokenize_chinese_chars=True, **kwargs): 132 | """Constructs a BertTokenizer. 133 | 134 | Args: 135 | **vocab_file**: Path to a one-wordpiece-per-line vocabulary file 136 | **do_lower_case**: (`optional`) boolean (default True) 137 | Whether to lower case the input 138 | Only has an effect when do_basic_tokenize=True 139 | **do_basic_tokenize**: (`optional`) boolean (default True) 140 | Whether to do basic tokenization before wordpiece. 141 | **never_split**: (`optional`) list of string 142 | List of tokens which will never be split during tokenization. 143 | Only has an effect when do_basic_tokenize=True 144 | **tokenize_chinese_chars**: (`optional`) boolean (default True) 145 | Whether to tokenize Chinese characters. 146 | This should likely be deactivated for Japanese: 147 | see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 148 | """ 149 | super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token, 150 | pad_token=pad_token, cls_token=cls_token, 151 | mask_token=mask_token, **kwargs) 152 | self.max_len_single_sentence = self.max_len - 2 # take into account special tokens 153 | self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens 154 | 155 | if not os.path.isfile(vocab_file): 156 | raise ValueError( 157 | "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " 158 | "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) 159 | self.vocab = load_vocab(vocab_file) 160 | self.ids_to_tokens = collections.OrderedDict( 161 | [(ids, tok) for tok, ids in self.vocab.items()]) 162 | self.do_basic_tokenize = do_basic_tokenize 163 | if do_basic_tokenize: 164 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, 165 | never_split=never_split, 166 | tokenize_chinese_chars=tokenize_chinese_chars) 167 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) 168 | 169 | @property 170 | def vocab_size(self): 171 | return len(self.vocab) 172 | 173 | def _tokenize(self, text): 174 | split_tokens = [] 175 | if self.do_basic_tokenize: 176 | for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): 177 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 178 | split_tokens.append(sub_token) 179 | else: 180 | split_tokens = self.wordpiece_tokenizer.tokenize(text) 181 | return split_tokens 182 | 183 | def _convert_token_to_id(self, token): 184 | """ Converts a token (str/unicode) in an id using the vocab. """ 185 | return self.vocab.get(token, self.vocab.get(self.unk_token)) 186 | 187 | def _convert_id_to_token(self, index): 188 | """Converts an index (integer) in a token (string/unicode) using the vocab.""" 189 | return self.ids_to_tokens.get(index, self.unk_token) 190 | 191 | def convert_tokens_to_string(self, tokens): 192 | """ Converts a sequence of tokens (string) in a single string. 
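            For example, ["un", "##aff", "##able"] is joined back into "unaffable".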
""" 193 | out_string = ' '.join(tokens).replace(' ##', '').strip() 194 | return out_string 195 | 196 | def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): 197 | """ 198 | Build model inputs from a sequence or a pair of sequence for sequence classification tasks 199 | by concatenating and adding special tokens. 200 | A BERT sequence has the following format: 201 | single sequence: [CLS] X [SEP] 202 | pair of sequences: [CLS] A [SEP] B [SEP] 203 | """ 204 | if token_ids_1 is None: 205 | return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] 206 | cls = [self.cls_token_id] 207 | sep = [self.sep_token_id] 208 | return cls + token_ids_0 + sep + token_ids_1 + sep 209 | 210 | def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): 211 | """ 212 | Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding 213 | special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. 214 | 215 | Args: 216 | token_ids_0: list of ids (must not contain special tokens) 217 | token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids 218 | for sequence pairs 219 | already_has_special_tokens: (default False) Set to True if the token list is already formated with 220 | special tokens for the model 221 | 222 | Returns: 223 | A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. 224 | """ 225 | 226 | if already_has_special_tokens: 227 | if token_ids_1 is not None: 228 | raise ValueError("You should not supply a second sequence if the provided sequence of " 229 | "ids is already formated with special tokens for the model.") 230 | return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) 231 | 232 | if token_ids_1 is not None: 233 | return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] 234 | return [1] + ([0] * len(token_ids_0)) + [1] 235 | 236 | def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): 237 | """ 238 | Creates a mask from the two sequences passed to be used in a sequence-pair classification task. 239 | A BERT sequence pair mask has the following format: 240 | 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 241 | | first sequence | second sequence 242 | 243 | if token_ids_1 is None, only returns the first portion of the mask (0's). 244 | """ 245 | sep = [self.sep_token_id] 246 | cls = [self.cls_token_id] 247 | if token_ids_1 is None: 248 | return len(cls + token_ids_0 + sep) * [0] 249 | return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] 250 | 251 | def save_vocabulary(self, vocab_path): 252 | """Save the tokenizer vocabulary to a directory or file.""" 253 | index = 0 254 | if os.path.isdir(vocab_path): 255 | vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file']) 256 | else: 257 | vocab_file = vocab_path 258 | with open(vocab_file, "w", encoding="utf-8") as writer: 259 | for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): 260 | if index != token_index: 261 | logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive." 
262 | " Please check that the vocabulary is not corrupted!".format(vocab_file)) 263 | index = token_index 264 | writer.write(token + u'\n') 265 | index += 1 266 | return (vocab_file,) 267 | 268 | 269 | class BasicTokenizer(object): 270 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 271 | 272 | def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True): 273 | """ Constructs a BasicTokenizer. 274 | 275 | Args: 276 | **do_lower_case**: Whether to lower case the input. 277 | **never_split**: (`optional`) list of str 278 | Kept for backward compatibility purposes. 279 | Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) 280 | List of token not to split. 281 | **tokenize_chinese_chars**: (`optional`) boolean (default True) 282 | Whether to tokenize Chinese characters. 283 | This should likely be deactivated for Japanese: 284 | see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 285 | """ 286 | if never_split is None: 287 | never_split = [] 288 | self.do_lower_case = do_lower_case 289 | self.never_split = never_split 290 | self.tokenize_chinese_chars = tokenize_chinese_chars 291 | 292 | def tokenize(self, text, never_split=None): 293 | """ Basic Tokenization of a piece of text. 294 | Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer. 295 | 296 | Args: 297 | **never_split**: (`optional`) list of str 298 | Kept for backward compatibility purposes. 299 | Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) 300 | List of token not to split. 301 | """ 302 | never_split = self.never_split + (never_split if never_split is not None else []) 303 | text = self._clean_text(text) 304 | # This was added on November 1st, 2018 for the multilingual and Chinese 305 | # models. This is also applied to the English models now, but it doesn't 306 | # matter since the English models were not trained on any Chinese data 307 | # and generally don't have any Chinese data in them (there are Chinese 308 | # characters in the vocabulary because Wikipedia does have some Chinese 309 | # words in the English Wikipedia.). 
310 | if self.tokenize_chinese_chars: 311 | text = self._tokenize_chinese_chars(text) 312 | orig_tokens = whitespace_tokenize(text) 313 | split_tokens = [] 314 | for token in orig_tokens: 315 | if self.do_lower_case and token not in never_split: 316 | token = token.lower() 317 | token = self._run_strip_accents(token) 318 | split_tokens.extend(self._run_split_on_punc(token)) 319 | 320 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 321 | return output_tokens 322 | 323 | def _run_strip_accents(self, text): 324 | """Strips accents from a piece of text.""" 325 | text = unicodedata.normalize("NFD", text) 326 | output = [] 327 | for char in text: 328 | cat = unicodedata.category(char) 329 | if cat == "Mn": 330 | continue 331 | output.append(char) 332 | return "".join(output) 333 | 334 | def _run_split_on_punc(self, text, never_split=None): 335 | """Splits punctuation on a piece of text.""" 336 | if never_split is not None and text in never_split: 337 | return [text] 338 | chars = list(text) 339 | i = 0 340 | start_new_word = True 341 | output = [] 342 | while i < len(chars): 343 | char = chars[i] 344 | if _is_punctuation(char): 345 | output.append([char]) 346 | start_new_word = True 347 | else: 348 | if start_new_word: 349 | output.append([]) 350 | start_new_word = False 351 | output[-1].append(char) 352 | i += 1 353 | 354 | return ["".join(x) for x in output] 355 | 356 | def _tokenize_chinese_chars(self, text): 357 | """Adds whitespace around any CJK character.""" 358 | output = [] 359 | for char in text: 360 | cp = ord(char) 361 | if self._is_chinese_char(cp): 362 | output.append(" ") 363 | output.append(char) 364 | output.append(" ") 365 | else: 366 | output.append(char) 367 | return "".join(output) 368 | 369 | def _is_chinese_char(self, cp): 370 | """Checks whether CP is the codepoint of a CJK character.""" 371 | # This defines a "chinese character" as anything in the CJK Unicode block: 372 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 373 | # 374 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 375 | # despite its name. The modern Korean Hangul alphabet is a different block, 376 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 377 | # space-separated words, so they are not treated specially and handled 378 | # like the all of the other languages. 379 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 380 | (cp >= 0x3400 and cp <= 0x4DBF) or # 381 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 382 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 383 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 384 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 385 | (cp >= 0xF900 and cp <= 0xFAFF) or # 386 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 387 | return True 388 | 389 | return False 390 | 391 | def _clean_text(self, text): 392 | """Performs invalid character removal and whitespace cleanup on text.""" 393 | output = [] 394 | for char in text: 395 | cp = ord(char) 396 | if cp == 0 or cp == 0xfffd or _is_control(char): 397 | continue 398 | if _is_whitespace(char): 399 | output.append(" ") 400 | else: 401 | output.append(char) 402 | return "".join(output) 403 | 404 | 405 | class WordpieceTokenizer(object): 406 | """Runs WordPiece tokenization.""" 407 | 408 | def __init__(self, vocab, unk_token, max_input_chars_per_word=100): 409 | self.vocab = vocab 410 | self.unk_token = unk_token 411 | self.max_input_chars_per_word = max_input_chars_per_word 412 | 413 | def tokenize(self, text): 414 | """Tokenizes a piece of text into its word pieces. 
415 | 416 | This uses a greedy longest-match-first algorithm to perform tokenization 417 | using the given vocabulary. 418 | 419 | For example: 420 | input = "unaffable" 421 | output = ["un", "##aff", "##able"] 422 | 423 | Args: 424 | text: A single token or whitespace separated tokens. This should have 425 | already been passed through `BasicTokenizer`. 426 | 427 | Returns: 428 | A list of wordpiece tokens. 429 | """ 430 | 431 | output_tokens = [] 432 | for token in whitespace_tokenize(text): 433 | chars = list(token) 434 | if len(chars) > self.max_input_chars_per_word: 435 | output_tokens.append(self.unk_token) 436 | continue 437 | 438 | is_bad = False 439 | start = 0 440 | sub_tokens = [] 441 | while start < len(chars): 442 | end = len(chars) 443 | cur_substr = None 444 | while start < end: 445 | substr = "".join(chars[start:end]) 446 | if start > 0: 447 | substr = "##" + substr 448 | if substr in self.vocab: 449 | cur_substr = substr 450 | break 451 | end -= 1 452 | if cur_substr is None: 453 | is_bad = True 454 | break 455 | sub_tokens.append(cur_substr) 456 | start = end 457 | 458 | if is_bad: 459 | output_tokens.append(self.unk_token) 460 | else: 461 | output_tokens.extend(sub_tokens) 462 | return output_tokens 463 | 464 | 465 | def _is_whitespace(char): 466 | """Checks whether `chars` is a whitespace character.""" 467 | # \t, \n, and \r are technically contorl characters but we treat them 468 | # as whitespace since they are generally considered as such. 469 | if char == " " or char == "\t" or char == "\n" or char == "\r": 470 | return True 471 | cat = unicodedata.category(char) 472 | if cat == "Zs": 473 | return True 474 | return False 475 | 476 | 477 | def _is_control(char): 478 | """Checks whether `chars` is a control character.""" 479 | # These are technically control characters but we count them as whitespace 480 | # characters. 481 | if char == "\t" or char == "\n" or char == "\r": 482 | return False 483 | cat = unicodedata.category(char) 484 | if cat.startswith("C"): 485 | return True 486 | return False 487 | 488 | 489 | def _is_punctuation(char): 490 | """Checks whether `chars` is a punctuation character.""" 491 | cp = ord(char) 492 | # We treat all non-letter/number ASCII as punctuation. 493 | # Characters such as "^", "$", and "`" are not in the Unicode 494 | # Punctuation class but we treat them as punctuation anyways, for 495 | # consistency. 496 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 497 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 498 | return True 499 | cat = unicodedata.category(char) 500 | if cat.startswith("P"): 501 | return True 502 | return False 503 | -------------------------------------------------------------------------------- /modeling_roberta1.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """PyTorch RoBERTa model. """ 17 | 18 | from __future__ import (absolute_import, division, print_function, 19 | unicode_literals) 20 | 21 | import logging 22 | 23 | import torch 24 | import torch.nn as nn 25 | from torch.nn import CrossEntropyLoss, MSELoss 26 | 27 | from modeling_bert1 import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu 28 | from configuration_roberta1 import RobertaConfig 29 | from file_utils1 import add_start_docstrings 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = { 34 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin", 35 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin", 36 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin", 37 | } 38 | 39 | 40 | class RobertaEmbeddings(BertEmbeddings): 41 | """ 42 | Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. 43 | """ 44 | 45 | def __init__(self, config): 46 | super(RobertaEmbeddings, self).__init__(config) 47 | self.padding_idx = 1 48 | self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx) 49 | self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size, 50 | padding_idx=self.padding_idx) 51 | 52 | def forward(self, input_ids, token_type_ids=None, position_ids=None): 53 | seq_length = input_ids.size(1) 54 | if position_ids is None: 55 | # Position numbers begin at padding_idx+1. Padding symbols are ignored. 56 | # cf. fairseq's `utils.make_positions` 57 | position_ids = torch.arange(self.padding_idx + 1, seq_length + self.padding_idx + 1, dtype=torch.long, 58 | device=input_ids.device) 59 | position_ids = position_ids.unsqueeze(0).expand_as(input_ids) 60 | return super(RobertaEmbeddings, self).forward(input_ids, 61 | token_type_ids=token_type_ids, 62 | position_ids=position_ids) 63 | 64 | 65 | ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in 66 | `RoBERTa: A Robustly Optimized BERT Pretraining Approach`_ 67 | by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, 68 | Veselin Stoyanov. It is based on Google's BERT model released in 2018. 69 | 70 | It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining 71 | objective and training with much larger mini-batches and learning rates. 72 | 73 | This implementation is the same as BertModel with a tiny embeddings tweak as well as a setup for Roberta pretrained 74 | models. 75 | 76 | This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and 77 | refer to the PyTorch documentation for all matter related to general usage and behavior. 78 | 79 | .. _`RoBERTa: A Robustly Optimized BERT Pretraining Approach`: 80 | https://arxiv.org/abs/1907.11692 81 | 82 | .. _`torch.nn.Module`: 83 | https://pytorch.org/docs/stable/nn.html#module 84 | 85 | Parameters: 86 | config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the 87 | model. Initializing with a config file does not load the weights associated with the model, only the configuration. 88 | Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. 
89 | """ 90 | 91 | ROBERTA_INPUTS_DOCSTRING = r""" 92 | Inputs: 93 | **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: 94 | Indices of input sequence tokens in the vocabulary. 95 | To match pre-training, RoBERTa input sequence should be formatted with and tokens as follows: 96 | 97 | (a) For sequence pairs: 98 | 99 | ``tokens: Is this Jacksonville ? No it is not . `` 100 | 101 | (b) For single sequences: 102 | 103 | ``tokens: the dog is hairy . `` 104 | 105 | Fully encoded sequences or sequence pairs can be obtained using the RobertaTokenizer.encode function with 106 | the ``add_special_tokens`` parameter set to ``True``. 107 | 108 | RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on 109 | the right rather than the left. 110 | 111 | See :func:`transformers.PreTrainedTokenizer.encode` and 112 | :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. 113 | **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: 114 | Mask to avoid performing attention on padding token indices. 115 | Mask values selected in ``[0, 1]``: 116 | ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. 117 | **token_type_ids**: (`optional` need to be trained) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: 118 | Optional segment token indices to indicate first and second portions of the inputs. 119 | This embedding matrice is not trained (not pretrained during RoBERTa pretraining), you will have to train it 120 | during finetuning. 121 | Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` 122 | corresponds to a `sentence B` token 123 | (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details). 124 | **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: 125 | Indices of positions of each input sequence tokens in the position embeddings. 126 | Selected in the range ``[0, config.max_position_embeddings - 1[``. 127 | **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: 128 | Mask to nullify selected heads of the self-attention modules. 129 | Mask values selected in ``[0, 1]``: 130 | ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. 131 | """ 132 | 133 | 134 | @add_start_docstrings( 135 | "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", 136 | ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) 137 | class RobertaModel(BertModel): 138 | r""" 139 | Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: 140 | **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` 141 | Sequence of hidden-states at the output of the last layer of the model. 142 | **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)`` 143 | Last layer hidden-state of the first token of the sequence (classification token) 144 | further processed by a Linear layer and a Tanh activation function. The Linear 145 | layer weights are trained from the next sentence prediction (classification) 146 | objective during Bert pretraining. This output is usually *not* a good summary 147 | of the semantic content of the input, you're often better with averaging or pooling 148 | the sequence of hidden-states for the whole input sequence. 
149 | **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) 150 | list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) 151 | of shape ``(batch_size, sequence_length, hidden_size)``: 152 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 153 | **attentions**: (`optional`, returned when ``config.output_attentions=True``) 154 | list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: 155 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 156 | 157 | Examples:: 158 | 159 | tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 160 | model = RobertaModel.from_pretrained('roberta-base') 161 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 162 | outputs = model(input_ids) 163 | last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple 164 | 165 | """ 166 | config_class = RobertaConfig 167 | pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 168 | base_model_prefix = "roberta" 169 | 170 | def __init__(self, config): 171 | super(RobertaModel, self).__init__(config) 172 | 173 | self.embeddings = RobertaEmbeddings(config) 174 | self.init_weights() 175 | 176 | def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None): 177 | if input_ids[:, 0].sum().item() != 0: 178 | logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. " 179 | "This model requires special tokens in order to work. " 180 | "Please specify add_special_tokens=True in your tokenize.encode()" 181 | "or tokenizer.convert_tokens_to_ids().") 182 | return super(RobertaModel, self).forward(input_ids, 183 | attention_mask=attention_mask, 184 | token_type_ids=token_type_ids, 185 | position_ids=position_ids, 186 | head_mask=head_mask) 187 | 188 | 189 | @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, 190 | ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) 191 | class RobertaForMaskedLM(BertPreTrainedModel): 192 | r""" 193 | **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: 194 | Labels for computing the masked language modeling loss. 195 | Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) 196 | Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels 197 | in ``[0, ..., config.vocab_size]`` 198 | 199 | Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: 200 | **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: 201 | Masked language modeling loss. 202 | **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` 203 | Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 204 | **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) 205 | list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) 206 | of shape ``(batch_size, sequence_length, hidden_size)``: 207 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
208 | **attentions**: (`optional`, returned when ``config.output_attentions=True``) 209 | list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: 210 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 211 | 212 | Examples:: 213 | 214 | tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 215 | model = RobertaForMaskedLM.from_pretrained('roberta-base') 216 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 217 | outputs = model(input_ids, masked_lm_labels=input_ids) 218 | loss, prediction_scores = outputs[:2] 219 | 220 | """ 221 | config_class = RobertaConfig 222 | pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 223 | base_model_prefix = "roberta" 224 | 225 | def __init__(self, config): 226 | super(RobertaForMaskedLM, self).__init__(config) 227 | 228 | self.roberta = RobertaModel(config) 229 | self.lm_head = RobertaLMHead(config) 230 | 231 | self.init_weights() 232 | self.tie_weights() 233 | 234 | def tie_weights(self): 235 | """ Make sure we are sharing the input and output embeddings. 236 | Export to TorchScript can't handle parameter sharing so we are cloning them instead. 237 | """ 238 | self._tie_or_clone_weights(self.lm_head.decoder, self.roberta.embeddings.word_embeddings) 239 | 240 | def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, 241 | masked_lm_labels=None): 242 | outputs = self.roberta(input_ids, 243 | attention_mask=attention_mask, 244 | token_type_ids=token_type_ids, 245 | position_ids=position_ids, 246 | head_mask=head_mask) 247 | sequence_output = outputs[0] 248 | prediction_scores = self.lm_head(sequence_output) 249 | 250 | outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here 251 | 252 | if masked_lm_labels is not None: 253 | loss_fct = CrossEntropyLoss(ignore_index=-1) 254 | masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) 255 | outputs = (masked_lm_loss,) + outputs 256 | 257 | return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) 258 | 259 | 260 | class RobertaLMHead(nn.Module): 261 | """Roberta Head for masked language modeling.""" 262 | 263 | def __init__(self, config): 264 | super(RobertaLMHead, self).__init__() 265 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 266 | self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) 267 | 268 | self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 269 | self.bias = nn.Parameter(torch.zeros(config.vocab_size)) 270 | 271 | def forward(self, features, **kwargs): 272 | x = self.dense(features) 273 | x = gelu(x) 274 | x = self.layer_norm(x) 275 | 276 | # project back to size of vocabulary with bias 277 | x = self.decoder(x) + self.bias 278 | 279 | return x 280 | 281 | 282 | @add_start_docstrings("""RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer 283 | on top of the pooled output) e.g. for GLUE tasks. """, 284 | ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) 285 | class RobertaForSequenceClassification(BertPreTrainedModel): 286 | r""" 287 | **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: 288 | Labels for computing the sequence classification/regression loss. 
289 | Indices should be in ``[0, ..., config.num_labels]``. 290 | If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), 291 | If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). 292 | 293 | Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: 294 | **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: 295 | Classification (or regression if config.num_labels==1) loss. 296 | **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` 297 | Classification (or regression if config.num_labels==1) scores (before SoftMax). 298 | **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) 299 | list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) 300 | of shape ``(batch_size, sequence_length, hidden_size)``: 301 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 302 | **attentions**: (`optional`, returned when ``config.output_attentions=True``) 303 | list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: 304 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 305 | 306 | Examples:: 307 | 308 | tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 309 | model = RobertaForSequenceClassification.from_pretrained('roberta-base') 310 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 311 | labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 312 | outputs = model(input_ids, labels=labels) 313 | loss, logits = outputs[:2] 314 | 315 | """ 316 | config_class = RobertaConfig 317 | pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 318 | base_model_prefix = "roberta" 319 | 320 | def __init__(self, config): 321 | super(RobertaForSequenceClassification, self).__init__(config) 322 | self.num_labels = config.num_labels 323 | 324 | self.roberta = RobertaModel(config) 325 | self.classifier = RobertaClassificationHead(config) 326 | 327 | def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, 328 | labels=None): 329 | outputs = self.roberta(input_ids, 330 | attention_mask=attention_mask, 331 | token_type_ids=token_type_ids, 332 | position_ids=position_ids, 333 | head_mask=head_mask) 334 | sequence_output = outputs[0] 335 | logits = self.classifier(sequence_output) 336 | 337 | outputs = (logits,) + outputs[2:] 338 | if labels is not None: 339 | if self.num_labels == 1: 340 | # We are doing regression 341 | loss_fct = MSELoss() 342 | loss = loss_fct(logits.view(-1), labels.view(-1)) 343 | else: 344 | loss_fct = CrossEntropyLoss() 345 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 346 | outputs = (loss,) + outputs 347 | 348 | return outputs # (loss), logits, (hidden_states), (attentions) 349 | 350 | 351 | @add_start_docstrings("""Roberta Model with a multiple choice classification head on top (a linear layer on top of 352 | the pooled output and a softmax) e.g. for RocStories/SWAG tasks. 
""", 353 | ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) 354 | class RobertaForMultipleChoice(BertPreTrainedModel): 355 | r""" 356 | Inputs: 357 | **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: 358 | Indices of input sequence tokens in the vocabulary. 359 | The second dimension of the input (`num_choices`) indicates the number of choices to score. 360 | To match pre-training, RoBerta input sequence should be formatted with [CLS] and [SEP] tokens as follows: 361 | 362 | (a) For sequence pairs: 363 | 364 | ``tokens: [CLS] is this jack ##son ##ville ? [SEP] [SEP] no it is not . [SEP]`` 365 | 366 | ``token_type_ids: 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` 367 | 368 | (b) For single sequences: 369 | 370 | ``tokens: [CLS] the dog is hairy . [SEP]`` 371 | 372 | ``token_type_ids: 0 0 0 0 0 0 0`` 373 | 374 | Indices can be obtained using :class:`transformers.BertTokenizer`. 375 | See :func:`transformers.PreTrainedTokenizer.encode` and 376 | :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. 377 | **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: 378 | Segment token indices to indicate first and second portions of the inputs. 379 | The second dimension of the input (`num_choices`) indicates the number of choices to score. 380 | Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` 381 | **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``: 382 | Mask to avoid performing attention on padding token indices. 383 | The second dimension of the input (`num_choices`) indicates the number of choices to score. 384 | Mask values selected in ``[0, 1]``: 385 | ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. 386 | **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: 387 | Mask to nullify selected heads of the self-attention modules. 388 | Mask values selected in ``[0, 1]``: 389 | ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. 390 | **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: 391 | Labels for computing the multiple choice classification loss. 392 | Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension 393 | of the input tensors. (see `input_ids` above) 394 | 395 | Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: 396 | **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: 397 | Classification loss. 398 | **classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension 399 | of the input tensors. (see `input_ids` above). 400 | Classification scores (before SoftMax). 401 | **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) 402 | list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) 403 | of shape ``(batch_size, sequence_length, hidden_size)``: 404 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
405 | **attentions**: (`optional`, returned when ``config.output_attentions=True``) 406 | list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: 407 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 408 | 409 | Examples:: 410 | 411 | tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 412 | model = RobertaForMultipleChoice.from_pretrained('roberta-base') 413 | choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] 414 | input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices 415 | labels = torch.tensor(1).unsqueeze(0) # Batch size 1 416 | outputs = model(input_ids, labels=labels) 417 | loss, classification_scores = outputs[:2] 418 | 419 | """ 420 | config_class = RobertaConfig 421 | pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 422 | base_model_prefix = "roberta" 423 | 424 | def __init__(self, config): 425 | super(RobertaForMultipleChoice, self).__init__(config) 426 | 427 | self.roberta = RobertaModel(config) 428 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 429 | self.classifier = nn.Linear(config.hidden_size, 1) 430 | 431 | self.init_weights() 432 | 433 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, 434 | position_ids=None, head_mask=None): 435 | num_choices = input_ids.shape[1] 436 | 437 | flat_input_ids = input_ids.view(-1, input_ids.size(-1)) 438 | flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None 439 | flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None 440 | flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None 441 | outputs = self.roberta(flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, 442 | attention_mask=flat_attention_mask, head_mask=head_mask) 443 | pooled_output = outputs[1] 444 | 445 | pooled_output = self.dropout(pooled_output) 446 | logits = self.classifier(pooled_output) 447 | reshaped_logits = logits.view(-1, num_choices) 448 | 449 | outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here 450 | 451 | if labels is not None: 452 | loss_fct = CrossEntropyLoss() 453 | loss = loss_fct(reshaped_logits, labels) 454 | outputs = (loss,) + outputs 455 | 456 | return outputs # (loss), reshaped_logits, (hidden_states), (attentions) 457 | 458 | 459 | class RobertaClassificationHead(nn.Module): 460 | """Head for sentence-level classification tasks.""" 461 | 462 | def __init__(self, config): 463 | super(RobertaClassificationHead, self).__init__() 464 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 465 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 466 | self.out_proj = nn.Linear(config.hidden_size, config.num_labels) 467 | 468 | def forward(self, features, **kwargs): 469 | x = features[:, 0, :] # take token (equiv. 
to [CLS]) 470 | x = self.dropout(x) 471 | x = self.dense(x) 472 | x = torch.tanh(x) 473 | x = self.dropout(x) 474 | x = self.out_proj(x) 475 | return x 476 | -------------------------------------------------------------------------------- /run_glue.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa).""" 17 | 18 | from __future__ import absolute_import, division, print_function 19 | from sklearn.metrics import roc_curve, auc 20 | import math 21 | import argparse 22 | import glob 23 | import logging 24 | import os 25 | import random 26 | 27 | import numpy as np 28 | import torch 29 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 30 | TensorDataset) 31 | from torch.utils.data.distributed import DistributedSampler 32 | 33 | try: 34 | from torch.utils.tensorboard import SummaryWriter 35 | except: 36 | from tensorboardX import SummaryWriter 37 | 38 | from tqdm import tqdm, trange 39 | from file_utils1 import WEIGHTS_NAME 40 | from configuration_bert1 import BertConfig 41 | from modeling_bert1 import BertForMultiSequenceClassification 42 | from tokenization_bert1 import BertTokenizer 43 | from configuration_roberta1 import RobertaConfig 44 | from modeling_roberta1 import RobertaForSequenceClassification 45 | from tokenization_roberta1 import RobertaTokenizer 46 | # from transformers import (WEIGHTS_NAME, BertConfig, 47 | # BertForSequenceClassification, BertTokenizer, 48 | # RobertaConfig, 49 | # RobertaForSequenceClassification, 50 | # RobertaTokenizer, 51 | # XLMConfig, XLMForSequenceClassification, 52 | # XLMTokenizer, XLNetConfig, 53 | # XLNetForSequenceClassification, 54 | # XLNetTokenizer, 55 | # DistilBertConfig, 56 | # DistilBertForSequenceClassification, 57 | # DistilBertTokenizer) 58 | from optimization1 import AdamW,WarmupLinearSchedule 59 | # from transformers import AdamW, WarmupLinearSchedule 60 | from metrics1 import glue_compute_metrics as compute_metrics 61 | # from transformers import glue_compute_metrics as compute_metrics 62 | # from transformers import glue_output_modes as output_modes 63 | from glue1 import glue_output_modes as output_modes 64 | # from transformers import glue_processors as processors 65 | from glue1 import glue_processors as processors 66 | from glue1 import glue_convert_examples_to_features as convert_examples_to_features 67 | # from transformers import glue_convert_examples_to_features as convert_examples_to_features 68 | def sigmoid(x): 69 | return 1. 
/ (1 + np.exp(-x))
70 | logger = logging.getLogger(__name__)
71 | 
72 | ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig,
73 | RobertaConfig)), ())
74 | 
75 | MODEL_CLASSES = {
76 | 'bert': (BertConfig, BertForMultiSequenceClassification, BertTokenizer),
77 | # 'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
78 | # 'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
79 | 'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
80 | # 'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
81 | }
82 | 
83 | 
84 | def set_seed(args):
85 | random.seed(args.seed)
86 | np.random.seed(args.seed)
87 | torch.manual_seed(args.seed)
88 | if args.n_gpu > 0:
89 | torch.cuda.manual_seed_all(args.seed)
90 | 
91 | # def softmax(inMatrix):
92 | # """
93 | # softmax function
94 | # :param inMatrix: input matrix
95 | # :return:
96 | # """
97 | # m,n = np.shape(inMatrix) # get m, n (rows, columns)
98 | # outMatrix = np.mat(np.zeros((m,n))) # np.mat builds the output matrix
99 | # for i in range(m):
100 | # soft_sum = 0
101 | # for idx in range(0,n):
102 | # outMatrix[i,idx] = math.exp(inMatrix[i,idx]) # exponentiate (base e) so every entry is non-negative
103 | # soft_sum +=outMatrix[i,idx] # accumulate the row sum
104 | # for idx in range(0,n):
105 | # outMatrix[i,idx] = outMatrix[i,idx] /soft_sum # normalize each entry by the row sum
106 | # return outMatrix
107 | 
108 | def train(args, train_dataset, model, tokenizer):
109 | """ Train the model """
110 | if args.local_rank in [-1, 0]:
111 | tb_writer = SummaryWriter()
112 | 
113 | # args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
114 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
115 | train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
116 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
117 | 
118 | if args.max_steps > 0:
119 | t_total = args.max_steps
120 | args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
121 | else:
122 | t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
123 | 
124 | # Prepare optimizer and schedule (linear warmup and decay)
125 | no_decay = ['bias', 'LayerNorm.weight']
126 | optimizer_grouped_parameters = [
127 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
128 | 'weight_decay': args.weight_decay},
129 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
130 | ]
131 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
132 | scheduler = WarmupLinearSchedule(optimizer, warmup_steps=t_total * 0.06, t_total=t_total)
133 | if args.fp16:
134 | try:
135 | from apex import amp
136 | except ImportError:
137 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
138 | model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
139 | 
140 | # multi-gpu training (should be after apex fp16 initialization)
141 | if args.n_gpu > 1:
142 | model = torch.nn.DataParallel(model)
143 | 
144 | # Distributed training (should be after apex fp16 initialization)
145 | if args.local_rank != -1:
146 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
147 | output_device=args.local_rank,
148 | find_unused_parameters=True)
149 | 
150 | # Train!
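# The loop below works as follows: for each epoch, run a forward pass with labels to obtain the loss,
# average it across GPUs under DataParallel, divide it by gradient_accumulation_steps, and backpropagate
# (through amp.scale_loss when --fp16 is set). Every gradient_accumulation_steps batches the gradients are
# clipped to max_grad_norm, optimizer.step() and scheduler.step() are called, and the gradients are zeroed.
# Every logging_steps steps the learning rate and running loss (and, if --evaluate_during_training is set,
# the dev metrics) are written to TensorBoard, and every save_steps steps a checkpoint is saved to
# output_dir/checkpoint-<global_step>.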
151 | logger.info("***** Running training *****") 152 | logger.info(" Num examples = %d", len(train_dataset)) 153 | logger.info(" Num Epochs = %d", args.num_train_epochs) 154 | logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) 155 | logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", 156 | args.train_batch_size * args.gradient_accumulation_steps * ( 157 | torch.distributed.get_world_size() if args.local_rank != -1 else 1)) 158 | logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) 159 | logger.info(" Total optimization steps = %d", t_total) 160 | 161 | global_step = 0 162 | tr_loss, logging_loss = 0.0, 0.0 163 | model.zero_grad() 164 | train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) 165 | set_seed(args) # Added here for reproductibility (even between python 2 and 3) 166 | 167 | for _ in train_iterator: 168 | epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) 169 | for step, batch in enumerate(epoch_iterator): 170 | model.train() 171 | batch = tuple(t.to(args.device) for t in batch) 172 | inputs = {'input_ids': batch[0], 173 | 'attention_mask': batch[1], 174 | 'labels': batch[3]} 175 | if args.model_type != 'distilbert': 176 | inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 177 | 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids 178 | outputs = model(**inputs) 179 | loss = outputs[0] # model outputs are always tuple in transformers (see doc) 180 | 181 | if args.n_gpu > 1: 182 | loss = loss.mean() # mean() to average on multi-gpu parallel training 183 | if args.gradient_accumulation_steps > 1: 184 | loss = loss / args.gradient_accumulation_steps 185 | 186 | if args.fp16: 187 | with amp.scale_loss(loss, optimizer) as scaled_loss: 188 | scaled_loss.backward() 189 | else: 190 | loss.backward() 191 | tr_loss += loss.item() 192 | if (step + 1) % args.gradient_accumulation_steps == 0 and not args.tpu: 193 | if args.fp16: 194 | torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) 195 | else: 196 | torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) 197 | 198 | optimizer.step() 199 | scheduler.step() # Update learning rate schedule 200 | model.zero_grad() 201 | global_step += 1 202 | 203 | if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: 204 | # Log metrics 205 | if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well 206 | results = evaluate(args, model, tokenizer) 207 | for key, value in results.items(): 208 | tb_writer.add_scalar('eval_{}'.format(key), value, global_step) 209 | tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) 210 | tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step) 211 | logging_loss = tr_loss 212 | 213 | if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: 214 | # Save model checkpoint 215 | output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) 216 | if not os.path.exists(output_dir): 217 | os.makedirs(output_dir) 218 | model_to_save = model.module if hasattr(model, 219 | 'module') else model # Take care of distributed/parallel training 220 | model_to_save.save_pretrained(output_dir) 221 | torch.save(args, os.path.join(output_dir, 'training_args.bin')) 
222 | logger.info("Saving model checkpoint to %s", output_dir) 223 | 224 | if args.tpu: 225 | args.xla_model.optimizer_step(optimizer, barrier=True) 226 | model.zero_grad() 227 | global_step += 1 228 | 229 | if args.max_steps > 0 and global_step > args.max_steps: 230 | epoch_iterator.close() 231 | break 232 | if args.max_steps > 0 and global_step > args.max_steps: 233 | train_iterator.close() 234 | break 235 | eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,) 236 | eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else ( 237 | args.output_dir,) 238 | 239 | if args.local_rank in [-1, 0]: 240 | tb_writer.close() 241 | 242 | return global_step, tr_loss / global_step 243 | 244 | 245 | def evaluate(args, model, tokenizer, prefix=""): 246 | # Loop to handle MNLI double evaluation (matched, mis-matched) 247 | eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,) 248 | eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,) 249 | 250 | results = {} 251 | for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs): 252 | eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True) 253 | 254 | if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: 255 | os.makedirs(eval_output_dir) 256 | 257 | args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) 258 | # Note that DistributedSampler samples randomly 259 | eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) 260 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) 261 | 262 | # Eval! 
263 | logger.info("***** Running evaluation {} *****".format(prefix)) 264 | logger.info(" Num examples = %d", len(eval_dataset)) 265 | logger.info(" Batch size = %d", args.eval_batch_size) 266 | eval_loss = 0.0 267 | nb_eval_steps = 0 268 | preds = None 269 | out_label_ids = None 270 | all_logits=[] 271 | for batch in tqdm(eval_dataloader, desc="Evaluating"): 272 | model.eval() 273 | batch = tuple(t.to(args.device) for t in batch) 274 | 275 | with torch.no_grad(): 276 | inputs = {'input_ids': batch[0], 277 | 'attention_mask': batch[1], 278 | 'labels': batch[3]} 279 | if args.model_type != 'distilbert': 280 | inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 281 | 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids 282 | outputs = model(**inputs) 283 | tmp_eval_loss, logits = outputs[:2] 284 | tem_logits=logits 285 | Array_logits=np.array((tem_logits.cpu())) 286 | all_logits.append(Array_logits[0].tolist()) 287 | all_logits.append(Array_logits[1].tolist()) 288 | all_logits.append(Array_logits[2].tolist()) 289 | eval_loss += tmp_eval_loss.mean().item() 290 | nb_eval_steps += 1 291 | if preds is None: 292 | preds = logits.detach().cpu().numpy() 293 | out_label_ids = inputs['labels'].detach().cpu().numpy() 294 | else: 295 | preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) 296 | out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) 297 | # for x in range(10,98): 298 | Q=0.9 299 | tem_logits_NEW=[] 300 | tem_list=[] 301 | all_logits=np.array(all_logits) 302 | for i in range(399): 303 | for j in range(16): 304 | tem_list.append(sigmoid(all_logits[i][j])) 305 | tem_logits_NEW.append(tem_list) 306 | tem_list=[] 307 | # tem_logits_NEW=np.array(tem_logits_NEW) 308 | # tem_logits_NEW=softmax(tem_logits_NEW) 309 | # tem_logits_NEW=tem_logits_NEW.getA().tolist() 310 | 311 | for i in range(399): 312 | for j in range(16): 313 | if tem_logits_NEW[i][j] > Q: 314 | tem_logits_NEW[i][j]=1 315 | else: tem_logits_NEW[i][j]=0 316 | tem_logits_NEW=np.array(tem_logits_NEW) 317 | count=0 318 | for i in range(399): 319 | tem_1=tem_logits_NEW[i] 320 | tem_2=out_label_ids[i] 321 | w=0 322 | k=0 323 | z=0 324 | for j in range(16): 325 | if tem_1[j]==1: 326 | w=w+1 327 | for j in range(16): 328 | if tem_2[j]==1: 329 | k=k+1 330 | for j in range(16): 331 | if tem_1[j]==1 and tem_2[j]==1: 332 | z=z+1 333 | N = z*1.0/(w+0.00000001) 334 | M = z*1.0/(k+0.00000001) 335 | if N+M == 0: 336 | H=0 337 | else: 338 | H = (2*N*M)/(N+M) 339 | count=count+H 340 | F1=count/399 341 | print("***********************************************") 342 | print(F1) 343 | print("***********************************************") 344 | 345 | # all_logits = np.array(all_logits) 346 | # eval_loss = eval_loss / nb_eval_steps 347 | # fpr = dict() 348 | # tpr = dict() 349 | # roc_auc = dict() 350 | # for i in range(16): 351 | # fpr[i], tpr[i], _ = roc_curve(out_label_ids[:, i], all_logits[:, i]) 352 | # roc_auc[i] = auc(fpr[i], tpr[i]) 353 | # fpr["micro"], tpr["micro"], _ = roc_curve(out_label_ids.ravel(), all_logits.ravel()) 354 | # roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) 355 | # print(roc_auc) 356 | H = {'F1': F1} 357 | return H 358 | # if args.output_mode == "MultiLabelclassification": 359 | # output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") 360 | # with open(output_eval_file, "w") as writer: 361 | # logger.info("***** Eval results {} *****".format(prefix)) 362 | # # for key in sorted(result.keys()): 363 | # logger.info(" %s 
", str(eval_loss)) 364 | # writer.write("%s\n" % (str(eval_loss))) 365 | # return eval_loss 366 | # if args.output_mode == "classification": 367 | # preds = np.argmax(preds, axis=1) 368 | # elif args.output_mode == "regression": 369 | # preds = np.squeeze(preds) 370 | # result = compute_metrics(eval_task, preds, out_label_ids) 371 | # results.update(result) 372 | # 373 | # output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") 374 | # with open(output_eval_file, "w") as writer: 375 | # logger.info("***** Eval results {} *****".format(prefix)) 376 | # for key in sorted(result.keys()): 377 | # logger.info(" %s = %s", key, str(result[key])) 378 | # writer.write("%s = %s\n" % (key, str(result[key]))) 379 | # 380 | # return results 381 | 382 | 383 | def load_and_cache_examples(args, task, tokenizer, evaluate=False): 384 | if args.local_rank not in [-1, 0] and not evaluate: 385 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 386 | 387 | processor = processors[task]() 388 | output_mode = output_modes[task] 389 | # Load data features from cache or dataset file 390 | cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format( 391 | 'dev' if evaluate else 'train', 392 | list(filter(None, args.model_name_or_path.split('/'))).pop(), 393 | str(args.max_seq_length), 394 | str(task))) 395 | if os.path.exists(cached_features_file) and not args.overwrite_cache: 396 | logger.info("Loading features from cached file %s", cached_features_file) 397 | features = torch.load(cached_features_file) 398 | else: 399 | logger.info("Creating features from dataset file at %s", args.data_dir) 400 | label_list = processor.get_labels() 401 | if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']: 402 | # HACK(label indices are swapped in RoBERTa pretrained model) 403 | label_list[1], label_list[2] = label_list[2], label_list[1] 404 | examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples( 405 | args.data_dir) 406 | features = convert_examples_to_features(examples, 407 | tokenizer, 408 | label_list=label_list, 409 | max_length=args.max_seq_length, 410 | output_mode=output_mode, 411 | pad_on_left=bool(args.model_type in ['xlnet']), 412 | # pad on the left for xlnet 413 | pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], 414 | pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0, 415 | ) 416 | if args.local_rank in [-1, 0]: 417 | logger.info("Saving features into cached file %s", cached_features_file) 418 | torch.save(features, cached_features_file) 419 | 420 | if args.local_rank == 0 and not evaluate: 421 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 422 | 423 | # Convert to Tensors and build dataset 424 | all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) 425 | all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) 426 | all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) 427 | if output_mode == "classification": 428 | all_labels = torch.tensor([f.label for f in features], dtype=torch.long) 429 | elif output_mode == "regression": 430 | all_labels = torch.tensor([f.label for f in features], dtype=torch.float) 431 | elif output_mode == "MultiLabelclassification": 432 | all_labels = torch.tensor([f.label for f in 
features], dtype=torch.float) 433 | 434 | dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels) 435 | return dataset 436 | 437 | 438 | def main(): 439 | parser = argparse.ArgumentParser() 440 | 441 | ## Required parameters 442 | parser.add_argument("--data_dir", default="/home/msqin/bert/bert1/data_mutil", type=str, 443 | help="The input data dir. Should contain the .tsv files (or other data files) for the task.") 444 | parser.add_argument("--model_type", default="bert", type=str, 445 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) 446 | parser.add_argument("--model_name_or_path", default="/home/msqin/bert/bert1/uncased_L-12_H-768_A-12/pytorch_model.bin", type=str, 447 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join( 448 | ALL_MODELS)) 449 | parser.add_argument("--task_name", default="multilabel", type=str, 450 | help="The name of the task to train selected in the list: " + ", ".join(processors.keys())) 451 | parser.add_argument("--output_dir", default="/home/msqin/bert/bert1/tmp/new_output", type=str, 452 | help="The output directory where the model predictions and checkpoints will be written.") 453 | 454 | ## Other parameters 455 | parser.add_argument("--config_name", default="/home/msqin/bert/bert1/uncased_L-12_H-768_A-12/bert_config.json", type=str, 456 | help="Pretrained config name or path if not the same as model_name") 457 | parser.add_argument("--tokenizer_name", default="/home/msqin/bert/bert1/uncased_L-12_H-768_A-12/vocab.txt", type=str, 458 | help="Pretrained tokenizer name or path if not the same as model_name") 459 | parser.add_argument("--cache_dir", default="", type=str, 460 | help="Where do you want to store the pre-trained models downloaded from s3") 461 | parser.add_argument("--max_seq_length", default=32, type=int, 462 | help="The maximum total input sequence length after tokenization. 
Sequences longer "
463 | "than this will be truncated, sequences shorter will be padded.")
464 | parser.add_argument("--do_train", default=True,
465 | help="Whether to run training.")
466 | parser.add_argument("--do_eval", default=True,
467 | help="Whether to run eval on the dev set.")
468 | parser.add_argument("--evaluate_during_training", default=True,
469 | help="Run evaluation during training at each logging step.")
470 | parser.add_argument("--do_lower_case", default=True,
471 | help="Set this flag if you are using an uncased model.")
472 | 
473 | parser.add_argument("--per_gpu_train_batch_size", default=32, type=int,
474 | help="Batch size per GPU/CPU for training.")
475 | parser.add_argument("--per_gpu_eval_batch_size", default=3, type=int,
476 | help="Batch size per GPU/CPU for evaluation.")
477 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
478 | help="Number of update steps to accumulate before performing a backward/update pass.")
479 | parser.add_argument("--learning_rate", default=2e-5, type=float,
480 | help="The initial learning rate for Adam.")
481 | parser.add_argument("--weight_decay", default=0.0, type=float,
482 | help="Weight decay if we apply some.")
483 | parser.add_argument("--adam_epsilon", default=1e-8, type=float,
484 | help="Epsilon for Adam optimizer.")
485 | parser.add_argument("--max_grad_norm", default=1.0, type=float,
486 | help="Max gradient norm.")
487 | parser.add_argument("--num_train_epochs", default=20.0, type=float,
488 | help="Total number of training epochs to perform.")
489 | parser.add_argument("--max_steps", default=-1, type=int,
490 | help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
491 | parser.add_argument("--warmup_steps", default=0, type=int,
492 | help="Linear warmup over warmup_steps.")
493 | 
494 | parser.add_argument('--logging_steps', type=int, default=500,
495 | help="Log every X update steps.")
496 | parser.add_argument('--save_steps', type=int, default=500,
497 | help="Save checkpoint every X update steps.")
498 | parser.add_argument("--eval_all_checkpoints", default=True,
499 | help="Evaluate all checkpoints starting with the same prefix as model_name and ending with step number")
500 | parser.add_argument("--no_cuda", action='store_true',
501 | help="Avoid using CUDA when available")
502 | parser.add_argument('--overwrite_output_dir', action='store_true',
503 | help="Overwrite the content of the output directory")
504 | parser.add_argument('--overwrite_cache', action='store_true',
505 | help="Overwrite the cached training and evaluation sets")
506 | parser.add_argument('--seed', type=int, default=42,
507 | help="Random seed for initialization")
508 | 
509 | parser.add_argument('--tpu', action='store_true',
510 | help="Whether to run on the TPU defined in the environment variables")
511 | parser.add_argument('--tpu_ip_address', type=str, default='',
512 | help="TPU IP address if none are set in the environment variables")
513 | parser.add_argument('--tpu_name', type=str, default='',
514 | help="TPU name if none are set in the environment variables")
515 | parser.add_argument('--xrt_tpu_config', type=str, default='',
516 | help="XRT TPU config if none are set in the environment variables")
517 | 
518 | parser.add_argument('--fp16', action='store_true',
519 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
520 | parser.add_argument('--fp16_opt_level', type=str, default='O1',
521 | help="For fp16: Apex AMP optimization 
level selected in ['O0', 'O1', 'O2', and 'O3']." 522 | "See details at https://nvidia.github.io/apex/amp.html") 523 | parser.add_argument("--local_rank", type=int, default=-1, 524 | help="For distributed training: local_rank") 525 | parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") 526 | parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") 527 | args = parser.parse_args() 528 | 529 | if os.path.exists(args.output_dir) and os.listdir( 530 | args.output_dir) and args.do_train and not args.overwrite_output_dir: 531 | raise ValueError( 532 | "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( 533 | args.output_dir)) 534 | 535 | # Setup distant debugging if needed 536 | if args.server_ip and args.server_port: 537 | # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script 538 | import ptvsd 539 | print("Waiting for debugger attach") 540 | ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) 541 | ptvsd.wait_for_attach() 542 | 543 | # Setup CUDA, GPU & distributed training 544 | if args.local_rank == -1 or args.no_cuda: 545 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 546 | args.n_gpu = torch.cuda.device_count() 547 | else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 548 | torch.cuda.set_device(args.local_rank) 549 | device = torch.device("cuda", args.local_rank) 550 | torch.distributed.init_process_group(backend='nccl') 551 | args.n_gpu = 1 552 | args.device = device 553 | 554 | if args.tpu: 555 | if args.tpu_ip_address: 556 | os.environ["TPU_IP_ADDRESS"] = args.tpu_ip_address 557 | if args.tpu_name: 558 | os.environ["TPU_NAME"] = args.tpu_name 559 | if args.xrt_tpu_config: 560 | os.environ["XRT_TPU_CONFIG"] = args.xrt_tpu_config 561 | 562 | assert "TPU_IP_ADDRESS" in os.environ 563 | assert "TPU_NAME" in os.environ 564 | assert "XRT_TPU_CONFIG" in os.environ 565 | 566 | import torch_xla 567 | import torch_xla.core.xla_model as xm 568 | args.device = xm.xla_device() 569 | args.xla_model = xm 570 | 571 | # Setup logging 572 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 573 | datefmt='%m/%d/%Y %H:%M:%S', 574 | level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) 575 | logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 576 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) 577 | 578 | # Set seed 579 | set_seed(args) 580 | 581 | # Prepare GLUE task 582 | args.task_name = args.task_name.lower() 583 | if args.task_name not in processors: 584 | raise ValueError("Task not found: %s" % (args.task_name)) 585 | processor = processors[args.task_name]() 586 | args.output_mode = output_modes[args.task_name] 587 | label_list = processor.get_labels() 588 | num_labels = len(label_list) 589 | 590 | # Load pretrained model and tokenizer 591 | if args.local_rank not in [-1, 0]: 592 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 593 | 594 | args.model_type = args.model_type.lower() 595 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 596 | config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, 597 | num_labels=num_labels, 
finetuning_task=args.task_name) 598 | tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, 599 | do_lower_case=args.do_lower_case) 600 | model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), 601 | config=config) 602 | 603 | if args.local_rank == 0: 604 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 605 | 606 | model.to(args.device) 607 | 608 | logger.info("Training/evaluation parameters %s", args) 609 | 610 | # Training 611 | if args.do_train: 612 | train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) 613 | global_step, tr_loss = train(args, train_dataset, model, tokenizer) 614 | logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) 615 | 616 | # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() 617 | if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0) and not args.tpu: 618 | # Create output directory if needed 619 | if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: 620 | os.makedirs(args.output_dir) 621 | 622 | logger.info("Saving model checkpoint to %s", args.output_dir) 623 | # Save a trained model, configuration and tokenizer using `save_pretrained()`. 624 | # They can then be reloaded using `from_pretrained()` 625 | model_to_save = model.module if hasattr(model, 626 | 'module') else model # Take care of distributed/parallel training 627 | model_to_save.save_pretrained(args.output_dir) 628 | tokenizer.save_pretrained(args.output_dir) 629 | 630 | # Good practice: save your training arguments together with the trained model 631 | torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) 632 | 633 | # Load a trained model and vocabulary that you have fine-tuned 634 | model = model_class.from_pretrained(args.output_dir) 635 | tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) 636 | model.to(args.device) 637 | 638 | # Evaluation 639 | results = {} 640 | if args.do_eval and args.local_rank in [-1, 0]: 641 | tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) 642 | checkpoints = [args.output_dir] 643 | if args.eval_all_checkpoints: 644 | checkpoints = list( 645 | os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) 646 | logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging 647 | logger.info("Evaluate the following checkpoints: %s", checkpoints) 648 | for checkpoint in checkpoints: 649 | # checkpoint="/home/msqin/bert/bert1/tmp/My_output" 650 | global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" 651 | prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" 652 | 653 | model = model_class.from_pretrained(checkpoint) 654 | model.to(args.device) 655 | result = evaluate(args, model, tokenizer, prefix=prefix) 656 | result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) 657 | results.update(result) 658 | print(results) 659 | return results 660 | 661 | 662 | if __name__ == "__main__": 663 | main() -------------------------------------------------------------------------------- /modeling_utils1.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # 
Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """PyTorch BERT model.""" 17 | 18 | from __future__ import (absolute_import, division, print_function, 19 | unicode_literals) 20 | 21 | import copy 22 | import json 23 | import logging 24 | import os 25 | from io import open 26 | 27 | import six 28 | import torch 29 | from torch import nn 30 | from torch.nn import CrossEntropyLoss 31 | from torch.nn import functional as F 32 | 33 | from configuration_utils1 import PretrainedConfig 34 | from file_utils1 import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME 35 | 36 | logger = logging.getLogger(__name__) 37 | 38 | 39 | try: 40 | from torch.nn import Identity 41 | except ImportError: 42 | # Older PyTorch compatibility 43 | class Identity(nn.Module): 44 | r"""A placeholder identity operator that is argument-insensitive. 45 | """ 46 | def __init__(self, *args, **kwargs): 47 | super(Identity, self).__init__() 48 | 49 | def forward(self, input): 50 | return input 51 | 52 | class PreTrainedModel(nn.Module): 53 | r""" Base class for all models. 54 | 55 | :class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models 56 | as well as a few methods commons to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads. 57 | 58 | Class attributes (overridden by derived classes): 59 | - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture. 60 | - ``pretrained_model_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values. 61 | - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments: 62 | 63 | - ``model``: an instance of the relevant subclass of :class:`~transformers.PreTrainedModel`, 64 | - ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`, 65 | - ``path``: a path (string) to the TensorFlow checkpoint. 66 | 67 | - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model. 68 | """ 69 | config_class = None 70 | pretrained_model_archive_map = {} 71 | load_tf_weights = lambda model, config, path: None 72 | base_model_prefix = "" 73 | 74 | def __init__(self, config, *inputs, **kwargs): 75 | super(PreTrainedModel, self).__init__() 76 | if not isinstance(config, PretrainedConfig): 77 | raise ValueError( 78 | "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. 
" 79 | "To create a model from a pretrained model use " 80 | "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( 81 | self.__class__.__name__, self.__class__.__name__ 82 | )) 83 | # Save config in model 84 | self.config = config 85 | 86 | def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None): 87 | """ Build a resized Embedding Module from a provided token Embedding Module. 88 | Increasing the size will add newly initialized vectors at the end 89 | Reducing the size will remove vectors from the end 90 | 91 | Args: 92 | new_num_tokens: (`optional`) int 93 | New number of tokens in the embedding matrix. 94 | Increasing the size will add newly initialized vectors at the end 95 | Reducing the size will remove vectors from the end 96 | If not provided or None: return the provided token Embedding Module. 97 | Return: ``torch.nn.Embeddings`` 98 | Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None 99 | """ 100 | if new_num_tokens is None: 101 | return old_embeddings 102 | 103 | old_num_tokens, old_embedding_dim = old_embeddings.weight.size() 104 | if old_num_tokens == new_num_tokens: 105 | return old_embeddings 106 | 107 | # Build new embeddings 108 | new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim) 109 | new_embeddings.to(old_embeddings.weight.device) 110 | 111 | # initialize all new embeddings (in particular added tokens) 112 | self._init_weights(new_embeddings) 113 | 114 | # Copy word embeddings from the previous weights 115 | num_tokens_to_copy = min(old_num_tokens, new_num_tokens) 116 | new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :] 117 | 118 | return new_embeddings 119 | 120 | def _tie_or_clone_weights(self, first_module, second_module): 121 | """ Tie or clone module weights depending of weither we are using TorchScript or not 122 | """ 123 | if self.config.torchscript: 124 | first_module.weight = nn.Parameter(second_module.weight.clone()) 125 | else: 126 | first_module.weight = second_module.weight 127 | 128 | if hasattr(first_module, 'bias') and first_module.bias is not None: 129 | first_module.bias.data = torch.nn.functional.pad( 130 | first_module.bias.data, 131 | (0, first_module.weight.shape[0] - first_module.bias.shape[0]), 132 | 'constant', 133 | 0 134 | ) 135 | 136 | def resize_token_embeddings(self, new_num_tokens=None): 137 | """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. 138 | Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method. 139 | 140 | Arguments: 141 | 142 | new_num_tokens: (`optional`) int: 143 | New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. 144 | If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embeddings`` Module of the model. 
145 | 146 | Return: ``torch.nn.Embeddings`` 147 | Pointer to the input tokens Embeddings Module of the model 148 | """ 149 | base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed 150 | model_embeds = base_model._resize_token_embeddings(new_num_tokens) 151 | if new_num_tokens is None: 152 | return model_embeds 153 | 154 | # Update base model and current model config 155 | self.config.vocab_size = new_num_tokens 156 | base_model.vocab_size = new_num_tokens 157 | 158 | # Tie weights again if needed 159 | if hasattr(self, 'tie_weights'): 160 | self.tie_weights() 161 | 162 | return model_embeds 163 | 164 | def init_weights(self): 165 | """ Initialize and prunes weights if needed. """ 166 | # Initialize weights 167 | self.apply(self._init_weights) 168 | 169 | # Prune heads if needed 170 | if self.config.pruned_heads: 171 | self.prune_heads(self.config.pruned_heads) 172 | 173 | def prune_heads(self, heads_to_prune): 174 | """ Prunes heads of the base model. 175 | 176 | Arguments: 177 | 178 | heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`). 179 | E.g. {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2. 180 | """ 181 | base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed 182 | 183 | # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads 184 | for layer, heads in heads_to_prune.items(): 185 | union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads) 186 | self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON 187 | 188 | base_model._prune_heads(heads_to_prune) 189 | 190 | def save_pretrained(self, save_directory): 191 | """ Save a model and its configuration file to a directory, so that it 192 | can be re-loaded using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method. 193 | """ 194 | assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved" 195 | 196 | # Only save the model it-self if we are using distributed training 197 | model_to_save = self.module if hasattr(self, 'module') else self 198 | 199 | # Save configuration file 200 | model_to_save.config.save_pretrained(save_directory) 201 | 202 | # If we save using the predefined names, we can load using `from_pretrained` 203 | output_model_file = os.path.join(save_directory, WEIGHTS_NAME) 204 | torch.save(model_to_save.state_dict(), output_model_file) 205 | logger.info("Model weights saved in {}".format(output_model_file)) 206 | 207 | @classmethod 208 | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): 209 | r"""Instantiate a pretrained pytorch model from a pre-trained model configuration. 210 | 211 | The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated) 212 | To train the model, you should first set it back in training mode with ``model.train()`` 213 | 214 | The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model. 215 | It is up to you to train those weights with a downstream fine-tuning task. 216 | 217 | The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded. 
218 | 219 | Parameters: 220 | pretrained_model_name_or_path: either: 221 | 222 | - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. 223 | - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. 224 | - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. 225 | - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``) 226 | 227 | model_args: (`optional`) Sequence of positional arguments: 228 | All remaning positional arguments will be passed to the underlying model's ``__init__`` method 229 | 230 | config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: 231 | Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: 232 | 233 | - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or 234 | - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. 235 | - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. 236 | 237 | state_dict: (`optional`) dict: 238 | an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. 239 | This option can be used if you want to create a model from a pretrained configuration but load your own weights. 240 | In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. 241 | 242 | cache_dir: (`optional`) string: 243 | Path to a directory in which a downloaded pre-trained model 244 | configuration should be cached if the standard cache should not be used. 245 | 246 | force_download: (`optional`) boolean, default False: 247 | Force to (re-)download the model weights and configuration files and override the cached versions if they exists. 248 | 249 | proxies: (`optional`) dict, default None: 250 | A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. 251 | The proxies are used on each request. 252 | 253 | output_loading_info: (`optional`) boolean: 254 | Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. 255 | 256 | kwargs: (`optional`) Remaining dictionary of keyword arguments: 257 | Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). 
Behave differently depending on whether a `config` is provided or automatically loaded: 258 | 259 | - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) 260 | - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. 261 | 262 | Examples:: 263 | 264 | model = BertModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. 265 | model = BertModel.from_pretrained('./test/saved_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` 266 | model = BertModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading 267 | assert model.config.output_attention == True 268 | # Loading from a TF checkpoint file instead of a PyTorch model (slower) 269 | config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json') 270 | model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config) 271 | 272 | """ 273 | config = kwargs.pop('config', None) 274 | state_dict = kwargs.pop('state_dict', None) 275 | cache_dir = kwargs.pop('cache_dir', None) 276 | from_tf = kwargs.pop('from_tf', False) 277 | force_download = kwargs.pop('force_download', False) 278 | proxies = kwargs.pop('proxies', None) 279 | output_loading_info = kwargs.pop('output_loading_info', False) 280 | 281 | # Load config 282 | if config is None: 283 | config, model_kwargs = cls.config_class.from_pretrained( 284 | pretrained_model_name_or_path, *model_args, 285 | cache_dir=cache_dir, return_unused_kwargs=True, 286 | force_download=force_download, 287 | **kwargs 288 | ) 289 | else: 290 | model_kwargs = kwargs 291 | 292 | # Load model 293 | if pretrained_model_name_or_path is not None: 294 | if pretrained_model_name_or_path in cls.pretrained_model_archive_map: 295 | archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path] 296 | elif os.path.isdir(pretrained_model_name_or_path): 297 | if from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")): 298 | # Load from a TF 1.0 checkpoint 299 | archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index") 300 | elif from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)): 301 | # Load from a TF 2.0 checkpoint 302 | archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME) 303 | elif os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): 304 | # Load from a PyTorch checkpoint 305 | archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) 306 | else: 307 | raise EnvironmentError("Error no file named {} found in directory {} or `from_tf` set to False".format( 308 | [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"], 309 | pretrained_model_name_or_path)) 310 | elif os.path.isfile(pretrained_model_name_or_path): 311 | archive_file = pretrained_model_name_or_path 312 | else: 313 | assert from_tf, "Error finding file {}, no file or TF 
1.X checkpoint found".format(pretrained_model_name_or_path) 314 | archive_file = pretrained_model_name_or_path + ".index" 315 | 316 | # redirect to the cache, if necessary 317 | try: 318 | resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies) 319 | except EnvironmentError: 320 | if pretrained_model_name_or_path in cls.pretrained_model_archive_map: 321 | msg = "Couldn't reach server at '{}' to download pretrained weights.".format( 322 | archive_file) 323 | else: 324 | msg = "Model name '{}' was not found in model name list ({}). " \ 325 | "We assumed '{}' was a path or url to model weight files named one of {} but " \ 326 | "couldn't find any such file at this path or url.".format( 327 | pretrained_model_name_or_path, 328 | ', '.join(cls.pretrained_model_archive_map.keys()), 329 | archive_file, 330 | [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME]) 331 | raise EnvironmentError(msg) 332 | 333 | if resolved_archive_file == archive_file: 334 | logger.info("loading weights file {}".format(archive_file)) 335 | else: 336 | logger.info("loading weights file {} from cache at {}".format( 337 | archive_file, resolved_archive_file)) 338 | else: 339 | resolved_archive_file = None 340 | 341 | # Instantiate model. 342 | model = cls(config, *model_args, **model_kwargs) 343 | 344 | if state_dict is None and not from_tf: 345 | state_dict = torch.load(resolved_archive_file, map_location='cpu') 346 | 347 | missing_keys = [] 348 | unexpected_keys = [] 349 | error_msgs = [] 350 | 351 | if from_tf: 352 | if resolved_archive_file.endswith('.index'): 353 | # Load from a TensorFlow 1.X checkpoint - provided by original authors 354 | model = cls.load_tf_weights(model, config, resolved_archive_file[:-6]) # Remove the '.index' 355 | else: 356 | # Load from our TensorFlow 2.0 checkpoints 357 | try: 358 | from transformers import load_tf2_checkpoint_in_pytorch_model 359 | model = load_tf2_checkpoint_in_pytorch_model(model, resolved_archive_file, allow_missing_keys=True) 360 | except ImportError as e: 361 | logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. 
Please see " 362 | "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.") 363 | raise e 364 | else: 365 | # Convert old format to new format if needed from a PyTorch state_dict 366 | old_keys = [] 367 | new_keys = [] 368 | for key in state_dict.keys(): 369 | new_key = None 370 | if 'gamma' in key: 371 | new_key = key.replace('gamma', 'weight') 372 | if 'beta' in key: 373 | new_key = key.replace('beta', 'bias') 374 | if new_key: 375 | old_keys.append(key) 376 | new_keys.append(new_key) 377 | for old_key, new_key in zip(old_keys, new_keys): 378 | state_dict[new_key] = state_dict.pop(old_key) 379 | 380 | # copy state_dict so _load_from_state_dict can modify it 381 | metadata = getattr(state_dict, '_metadata', None) 382 | state_dict = state_dict.copy() 383 | if metadata is not None: 384 | state_dict._metadata = metadata 385 | 386 | def load(module, prefix=''): 387 | local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) 388 | module._load_from_state_dict( 389 | state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) 390 | for name, child in module._modules.items(): 391 | if child is not None: 392 | load(child, prefix + name + '.') 393 | 394 | # Make sure we are able to load base models as well as derived models (with heads) 395 | start_prefix = '' 396 | model_to_load = model 397 | if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()): 398 | start_prefix = cls.base_model_prefix + '.' 399 | if hasattr(model, cls.base_model_prefix) and not any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()): 400 | model_to_load = getattr(model, cls.base_model_prefix) 401 | 402 | load(model_to_load, prefix=start_prefix) 403 | if len(missing_keys) > 0: 404 | logger.info("Weights of {} not initialized from pretrained model: {}".format( 405 | model.__class__.__name__, missing_keys)) 406 | if len(unexpected_keys) > 0: 407 | logger.info("Weights from pretrained model not used in {}: {}".format( 408 | model.__class__.__name__, unexpected_keys)) 409 | if len(error_msgs) > 0: 410 | raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( 411 | model.__class__.__name__, "\n\t".join(error_msgs))) 412 | 413 | if hasattr(model, 'tie_weights'): 414 | model.tie_weights() # make sure word embedding weights are still tied 415 | 416 | # Set model in evaluation mode to desactivate DropOut modules by default 417 | model.eval() 418 | 419 | if output_loading_info: 420 | loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys, "error_msgs": error_msgs} 421 | return model, loading_info 422 | 423 | return model 424 | 425 | 426 | class Conv1D(nn.Module): 427 | def __init__(self, nf, nx): 428 | """ Conv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2) 429 | Basically works like a Linear layer but the weights are transposed 430 | """ 431 | super(Conv1D, self).__init__() 432 | self.nf = nf 433 | w = torch.empty(nx, nf) 434 | nn.init.normal_(w, std=0.02) 435 | self.weight = nn.Parameter(w) 436 | self.bias = nn.Parameter(torch.zeros(nf)) 437 | 438 | def forward(self, x): 439 | size_out = x.size()[:-1] + (self.nf,) 440 | x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) 441 | x = x.view(*size_out) 442 | return x 443 | 444 | 445 | class PoolerStartLogits(nn.Module): 446 | """ Compute SQuAD start_logits from sequence hidden states. 
""" 447 | def __init__(self, config): 448 | super(PoolerStartLogits, self).__init__() 449 | self.dense = nn.Linear(config.hidden_size, 1) 450 | 451 | def forward(self, hidden_states, p_mask=None): 452 | """ Args: 453 | **p_mask**: (`optional`) ``torch.FloatTensor`` of shape `(batch_size, seq_len)` 454 | invalid position mask such as query and special symbols (PAD, SEP, CLS) 455 | 1.0 means token should be masked. 456 | """ 457 | x = self.dense(hidden_states).squeeze(-1) 458 | 459 | if p_mask is not None: 460 | if next(self.parameters()).dtype == torch.float16: 461 | x = x * (1 - p_mask) - 65500 * p_mask 462 | else: 463 | x = x * (1 - p_mask) - 1e30 * p_mask 464 | 465 | return x 466 | 467 | 468 | class PoolerEndLogits(nn.Module): 469 | """ Compute SQuAD end_logits from sequence hidden states and start token hidden state. 470 | """ 471 | def __init__(self, config): 472 | super(PoolerEndLogits, self).__init__() 473 | self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) 474 | self.activation = nn.Tanh() 475 | self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) 476 | self.dense_1 = nn.Linear(config.hidden_size, 1) 477 | 478 | def forward(self, hidden_states, start_states=None, start_positions=None, p_mask=None): 479 | """ Args: 480 | One of ``start_states``, ``start_positions`` should be not None. 481 | If both are set, ``start_positions`` overrides ``start_states``. 482 | 483 | **start_states**: ``torch.LongTensor`` of shape identical to hidden_states 484 | hidden states of the first tokens for the labeled span. 485 | **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` 486 | position of the first token for the labeled span: 487 | **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)`` 488 | Mask of invalid position such as query and special symbols (PAD, SEP, CLS) 489 | 1.0 means token should be masked. 490 | """ 491 | assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None" 492 | if start_positions is not None: 493 | slen, hsz = hidden_states.shape[-2:] 494 | start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) 495 | start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz) 496 | start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz) 497 | 498 | x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1)) 499 | x = self.activation(x) 500 | x = self.LayerNorm(x) 501 | x = self.dense_1(x).squeeze(-1) 502 | 503 | if p_mask is not None: 504 | if next(self.parameters()).dtype == torch.float16: 505 | x = x * (1 - p_mask) - 65500 * p_mask 506 | else: 507 | x = x * (1 - p_mask) - 1e30 * p_mask 508 | 509 | return x 510 | 511 | 512 | class PoolerAnswerClass(nn.Module): 513 | """ Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """ 514 | def __init__(self, config): 515 | super(PoolerAnswerClass, self).__init__() 516 | self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) 517 | self.activation = nn.Tanh() 518 | self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False) 519 | 520 | def forward(self, hidden_states, start_states=None, start_positions=None, cls_index=None): 521 | """ 522 | Args: 523 | One of ``start_states``, ``start_positions`` should be not None. 524 | If both are set, ``start_positions`` overrides ``start_states``. 525 | 526 | **start_states**: ``torch.LongTensor`` of shape identical to ``hidden_states``. 
527 | hidden states of the first tokens for the labeled span. 528 | **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` 529 | position of the first token for the labeled span. 530 | **cls_index**: torch.LongTensor of shape ``(batch_size,)`` 531 | position of the CLS token. If None, take the last token. 532 | 533 | note(Original repo): 534 | no dependency on end_feature so that we can obtain one single `cls_logits` 535 | for each sample 536 | """ 537 | hsz = hidden_states.shape[-1] 538 | assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None" 539 | if start_positions is not None: 540 | start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) 541 | start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz) 542 | 543 | if cls_index is not None: 544 | cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) 545 | cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz) 546 | else: 547 | cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz) 548 | 549 | x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1)) 550 | x = self.activation(x) 551 | x = self.dense_1(x).squeeze(-1) 552 | 553 | return x 554 | 555 | 556 | class SQuADHead(nn.Module): 557 | r""" A SQuAD head inspired by XLNet. 558 | 559 | Parameters: 560 | config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model. 561 | 562 | Inputs: 563 | **hidden_states**: ``torch.FloatTensor`` of shape ``(batch_size, seq_len, hidden_size)`` 564 | hidden states of sequence tokens 565 | **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` 566 | position of the first token for the labeled span. 567 | **end_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` 568 | position of the last token for the labeled span. 569 | **cls_index**: torch.LongTensor of shape ``(batch_size,)`` 570 | position of the CLS token. If None, take the last token. 571 | **is_impossible**: ``torch.LongTensor`` of shape ``(batch_size,)`` 572 | Whether the question has a possible answer in the paragraph or not. 573 | **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)`` 574 | Mask of invalid position such as query and special symbols (PAD, SEP, CLS) 575 | 1.0 means token should be masked. 576 | 577 | Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: 578 | **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``: 579 | Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses. 580 | **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) 581 | ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)`` 582 | Log probabilities for the top config.start_n_top start token possibilities (beam-search). 583 | **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) 584 | ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)`` 585 | Indices for the top config.start_n_top start token possibilities (beam-search). 
586 | **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) 587 | ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` 588 | Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). 589 | **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) 590 | ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` 591 | Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). 592 | **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) 593 | ``torch.FloatTensor`` of shape ``(batch_size,)`` 594 | Log probabilities for the ``is_impossible`` label of the answers. 595 | """ 596 | def __init__(self, config): 597 | super(SQuADHead, self).__init__() 598 | self.start_n_top = config.start_n_top 599 | self.end_n_top = config.end_n_top 600 | 601 | self.start_logits = PoolerStartLogits(config) 602 | self.end_logits = PoolerEndLogits(config) 603 | self.answer_class = PoolerAnswerClass(config) 604 | 605 | def forward(self, hidden_states, start_positions=None, end_positions=None, 606 | cls_index=None, is_impossible=None, p_mask=None): 607 | outputs = () 608 | 609 | start_logits = self.start_logits(hidden_states, p_mask=p_mask) 610 | 611 | if start_positions is not None and end_positions is not None: 612 | # If we are on multi-GPU, let's remove the dimension added by batch splitting 613 | for x in (start_positions, end_positions, cls_index, is_impossible): 614 | if x is not None and x.dim() > 1: 615 | x.squeeze_(-1) 616 | 617 | # during training, compute the end logits based on the ground truth of the start position 618 | end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask) 619 | 620 | loss_fct = CrossEntropyLoss() 621 | start_loss = loss_fct(start_logits, start_positions) 622 | end_loss = loss_fct(end_logits, end_positions) 623 | total_loss = (start_loss + end_loss) / 2 624 | 625 | if cls_index is not None and is_impossible is not None: 626 | # Predict answerability from the representation of CLS and START 627 | cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index) 628 | loss_fct_cls = nn.BCEWithLogitsLoss() 629 | cls_loss = loss_fct_cls(cls_logits, is_impossible) 630 | 631 | # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss 632 | total_loss += cls_loss * 0.5 633 | 634 | outputs = (total_loss,) + outputs 635 | 636 | else: 637 | # during inference, compute the end logits based on beam search 638 | bsz, slen, hsz = hidden_states.size() 639 | start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) 640 | 641 | start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top) 642 | start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) 643 | start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) 644 | start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) 645 | 646 | hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz) 647 | p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None 648 | end_logits = 
self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
649 |             end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top)
650 | 
651 |             end_top_log_probs, end_top_index = torch.topk(end_log_probs, self.end_n_top, dim=1) # shape (bsz, end_n_top, start_n_top)
652 |             end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
653 |             end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
654 | 
655 |             start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
656 |             cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)
657 | 
658 |             outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs
659 | 
660 |         # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits
661 |         # or (if labels are provided) (total_loss,)
662 |         return outputs
663 | 
664 | 
665 | class SequenceSummary(nn.Module):
666 |     r""" Compute a single vector summary of a sequence's hidden states according to various possibilities:
667 |         Args of the config class:
668 |             summary_type:
669 |                 - 'last' => [default] take the last token hidden state (like XLNet)
670 |                 - 'first' => take the first token hidden state (like Bert)
671 |                 - 'mean' => take the mean of all tokens hidden states
672 |                 - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
673 |                 - 'attn' => Not implemented now, use multi-head attention
674 |             summary_use_proj: Add a projection after the vector extraction
675 |             summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
676 |             summary_activation: 'tanh' => add a tanh activation to the output, Other => no activation. Default: no activation.
677 |             summary_first_dropout: Add a dropout before the projection and activation
678 |             summary_last_dropout: Add a dropout after the projection and activation
679 |     """
680 |     def __init__(self, config):
681 |         super(SequenceSummary, self).__init__()
682 | 
683 |         self.summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last'
684 |         if self.summary_type == 'attn':
685 |             # We should use a standard multi-head attention module with absolute positional embedding for that.
686 |             # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
687 |             # We can probably just use the multi-head attention module of PyTorch >=1.1.0
688 |             raise NotImplementedError
689 | 
690 |         self.summary = Identity()
691 |         if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
692 |             if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0:
693 |                 num_classes = config.num_labels
694 |             else:
695 |                 num_classes = config.hidden_size
696 |             self.summary = nn.Linear(config.hidden_size, num_classes)
697 | 
698 |         self.activation = Identity()
699 |         if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
700 |             self.activation = nn.Tanh()
701 | 
702 |         self.first_dropout = Identity()
703 |         if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0:
704 |             self.first_dropout = nn.Dropout(config.summary_first_dropout)
705 | 
706 |         self.last_dropout = Identity()
707 |         if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
708 |             self.last_dropout = nn.Dropout(config.summary_last_dropout)
709 | 
710 |     def forward(self, hidden_states, cls_index=None):
711 |         """ hidden_states: float Tensor in shape [bsz, ..., seq_len, hidden_size], the hidden-states of the last layer.
712 |             cls_index: [optional] position of the classification token if summary_type == 'cls_index',
713 |                 shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
714 |                 if summary_type == 'cls_index' and cls_index is None:
715 |                     we take the last token of the sequence as classification token
716 |         """
717 |         if self.summary_type == 'last':
718 |             output = hidden_states[:, -1]
719 |         elif self.summary_type == 'first':
720 |             output = hidden_states[:, 0]
721 |         elif self.summary_type == 'mean':
722 |             output = hidden_states.mean(dim=1)
723 |         elif self.summary_type == 'cls_index':
724 |             if cls_index is None:
725 |                 cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2]-1, dtype=torch.long)
726 |             else:
727 |                 cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
728 |                 cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),))
729 |             # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
730 |             output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size)
731 |         elif self.summary_type == 'attn':
732 |             raise NotImplementedError
733 | 
734 |         output = self.first_dropout(output)
735 |         output = self.summary(output)
736 |         output = self.activation(output)
737 |         output = self.last_dropout(output)
738 | 
739 |         return output
740 | 
741 | 
742 | def prune_linear_layer(layer, index, dim=0):
743 |     """ Prune a linear layer (a model parameter) to keep only entries in index.
744 |         Return the pruned layer as a new layer with requires_grad=True.
745 |         Used to remove heads.
746 |     """
747 |     index = index.to(layer.weight.device)
748 |     W = layer.weight.index_select(dim, index).clone().detach()
749 |     if layer.bias is not None:
750 |         if dim == 1:
751 |             b = layer.bias.clone().detach()
752 |         else:
753 |             b = layer.bias[index].clone().detach()
754 |     new_size = list(layer.weight.size())
755 |     new_size[dim] = len(index)
756 |     new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device)
757 |     new_layer.weight.requires_grad = False
758 |     new_layer.weight.copy_(W.contiguous())
759 |     new_layer.weight.requires_grad = True
760 |     if layer.bias is not None:
761 |         new_layer.bias.requires_grad = False
762 |         new_layer.bias.copy_(b.contiguous())
763 |         new_layer.bias.requires_grad = True
764 |     return new_layer
765 | 
766 | 
767 | def prune_conv1d_layer(layer, index, dim=1):
768 |     """ Prune a Conv1D layer (a model parameter) to keep only entries in index.
769 |         A Conv1D works like a Linear layer (see e.g. BERT) but the weights are transposed.
770 |         Return the pruned layer as a new layer with requires_grad=True.
771 |         Used to remove heads.
772 |     """
773 |     index = index.to(layer.weight.device)
774 |     W = layer.weight.index_select(dim, index).clone().detach()
775 |     if dim == 0:
776 |         b = layer.bias.clone().detach()
777 |     else:
778 |         b = layer.bias[index].clone().detach()
779 |     new_size = list(layer.weight.size())
780 |     new_size[dim] = len(index)
781 |     new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device)
782 |     new_layer.weight.requires_grad = False
783 |     new_layer.weight.copy_(W.contiguous())
784 |     new_layer.weight.requires_grad = True
785 |     new_layer.bias.requires_grad = False
786 |     new_layer.bias.copy_(b.contiguous())
787 |     new_layer.bias.requires_grad = True
788 |     return new_layer
789 | 
790 | 
791 | def prune_layer(layer, index, dim=None):
792 |     """ Prune a Conv1D or nn.Linear layer (a model parameter) to keep only entries in index.
793 |         Return the pruned layer as a new layer with requires_grad=True.
794 |         Used to remove heads.
795 |     """
796 |     if isinstance(layer, nn.Linear):
797 |         return prune_linear_layer(layer, index, dim=0 if dim is None else dim)
798 |     elif isinstance(layer, Conv1D):
799 |         return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim)
800 |     else:
801 |         raise ValueError("Can't prune layer of class {}".format(layer.__class__))
802 | 
--------------------------------------------------------------------------------
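Usage note (added for illustration; not part of the original repository): the sketch below shows how the SequenceSummary head and the prune_linear_layer helper defined above can be exercised on their own, assuming modeling_utils1.py and the modules it imports are on the Python path. The SimpleNamespace config and every value in it are hypothetical stand-ins; only the attribute names that the hasattr() checks in SequenceSummary.__init__ probe are taken from the code above.

import torch
from types import SimpleNamespace

from modeling_utils1 import SequenceSummary, prune_linear_layer

# Hypothetical stand-in for a model config; only the attributes that
# SequenceSummary.__init__ checks with hasattr() are provided here.
config = SimpleNamespace(
    hidden_size=768,
    num_labels=16,                # e.g. one logit per classification label
    summary_type='first',         # use the first ([CLS]) token's hidden state
    summary_use_proj=True,        # add a projection after extraction
    summary_proj_to_labels=True,  # project to num_labels instead of hidden_size
    summary_activation='tanh',
    summary_first_dropout=0.1,
    summary_last_dropout=0.0,
)

summary = SequenceSummary(config)
hidden_states = torch.randn(2, 10, config.hidden_size)  # (bsz, seq_len, hidden_size)
logits = summary(hidden_states)
print(logits.shape)  # torch.Size([2, 16])

# prune_linear_layer returns a *new* nn.Linear that keeps only the selected
# output rows (dim=0); the original layer is left untouched.
layer = torch.nn.Linear(config.hidden_size, config.num_labels)
keep = torch.tensor([0, 3, 7, 9])
pruned = prune_linear_layer(layer, keep, dim=0)
print(pruned.weight.shape)  # torch.Size([4, 768])

Because prune_linear_layer copies the selected weights into a freshly constructed layer, it can be applied after pretrained weights are loaded; the same pattern applies to prune_conv1d_layer for GPT-2-style Conv1D layers.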