├── data_mutil
├── data
│   ├── READ
│   └── labels.txt
├── README.md
├── configuration_roberta1.py
├── metrics1.py
├── utils.py
├── tokenization_roberta1.py
├── configuration_bert1.py
├── optimization1.py
├── tokenization_gpt21.py
├── configuration_utils1.py
├── file_utils1.py
├── glue1.py
├── tokenization_bert1.py
├── modeling_roberta1.py
├── run_glue.py
└── modeling_utils1.py
/data_mutil:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/data/READ:
--------------------------------------------------------------------------------
1 | This is the folder where our text data is stored.
2 |
--------------------------------------------------------------------------------
/data/labels.txt:
--------------------------------------------------------------------------------
1 | inform_theater;
2 | inform_starttime;
3 | inform_numberofpeople
4 | greeting;
5 | thanks
6 | inform_other
7 | request_moviename;
8 | inform_genre
9 | request_ticket;
10 | inform_city;
11 | inform_state;
12 | inform_date
13 | inform_moviename
14 | confirm_answer;
15 | inform_zip
16 | inform_video_format
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Multi-Label Text Classification with BERT
2 | Uses the bert-base-uncased PyTorch pretrained model to perform multi-label classification of text.
3 |
4 |
5 | The project contains several subfolders; data is the text storage folder and holds the training, validation, and test data.
6 | Data description: each instance consists of a sentence and its corresponding labels, with 16 labels in total. The full label set is listed in labels.txt under the data folder.
7 | First install the transformers package, either with `pip install transformers` or with the equivalent conda command.
8 | Then open run_glue.py and update the file paths: the text folder path (data_dir) and the model (bert_base_uncased) paths. The model path consists of three parts: the first is uncased-model.bin, i.e. the path to the PyTorch model weights; the second is the path to the model's JSON configuration file; the third is the path to vocab.txt. Each of these must be updated in run_glue.py.
9 |
10 | The three model-related files must be downloaded by yourself!
11 | MODEL_ALL in run_glue.py contains the download addresses for the three files!
12 |
--------------------------------------------------------------------------------
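A minimal sketch of the path wiring the README above describes, using the config helper shipped in this repo. The folder name `./bert-base-uncased/` and the variable names are hypothetical placeholders, not necessarily the ones used inside run_glue.py:

```python
# Hypothetical local paths -- point these at the three files you downloaded
# (see MODEL_ALL in run_glue.py for the download addresses).
MODEL_WEIGHTS = "./bert-base-uncased/uncased-model.bin"  # PyTorch model weights
MODEL_CONFIG = "./bert-base-uncased/config.json"         # model configuration (JSON)
MODEL_VOCAB = "./bert-base-uncased/vocab.txt"            # WordPiece vocabulary

from configuration_bert1 import BertConfig

# The configuration file can be loaded with the helper defined in configuration_utils1.py.
config = BertConfig.from_json_file(MODEL_CONFIG)
config.num_labels = 16  # data/labels.txt defines 16 intent labels
```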
/configuration_roberta1.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ RoBERTa configuration """
17 |
18 | from __future__ import (absolute_import, division, print_function,
19 | unicode_literals)
20 |
21 | import logging
22 |
23 | from configuration_bert1 import BertConfig
24 |
25 | logger = logging.getLogger(__name__)
26 |
27 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
28 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json",
29 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json",
30 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json",
31 | }
32 |
33 |
34 | class RobertaConfig(BertConfig):
35 | pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
36 |
--------------------------------------------------------------------------------
/metrics1.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | import csv
18 | import sys
19 | import logging
20 |
21 | logger = logging.getLogger(__name__)
22 |
23 | try:
24 | from scipy.stats import pearsonr, spearmanr
25 | from sklearn.metrics import matthews_corrcoef, f1_score
26 | _has_sklearn = True
27 | except (AttributeError, ImportError) as e:
28 | logger.warning("To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html")
29 | _has_sklearn = False
30 |
31 | def is_sklearn_available():
32 | return _has_sklearn
33 |
34 | if _has_sklearn:
35 |
36 | def simple_accuracy(preds, labels):
37 | return (preds == labels).mean()
38 |
39 |
40 | def acc_and_f1(preds, labels):
41 | acc = simple_accuracy(preds, labels)
42 | f1 = f1_score(y_true=labels, y_pred=preds)
43 | return {
44 | "acc": acc,
45 | "f1": f1,
46 | "acc_and_f1": (acc + f1) / 2,
47 | }
48 |
49 |
50 | def pearson_and_spearman(preds, labels):
51 | pearson_corr = pearsonr(preds, labels)[0]
52 | spearman_corr = spearmanr(preds, labels)[0]
53 | return {
54 | "pearson": pearson_corr,
55 | "spearmanr": spearman_corr,
56 | "corr": (pearson_corr + spearman_corr) / 2,
57 | }
58 |
59 |
60 | def glue_compute_metrics(task_name, preds, labels):
61 | assert len(preds) == len(labels)
62 | if task_name == "cola":
63 | return {"mcc": matthews_corrcoef(labels, preds)}
64 | elif task_name == "sst-2":
65 | return {"acc": simple_accuracy(preds, labels)}
66 | elif task_name == "mrpc":
67 | return acc_and_f1(preds, labels)
68 | elif task_name == "sts-b":
69 | return pearson_and_spearman(preds, labels)
70 | elif task_name == "qqp":
71 | return acc_and_f1(preds, labels)
72 | elif task_name == "mnli":
73 | return {"acc": simple_accuracy(preds, labels)}
74 | elif task_name == "mnli-mm":
75 | return {"acc": simple_accuracy(preds, labels)}
76 | elif task_name == "qnli":
77 | return {"acc": simple_accuracy(preds, labels)}
78 | elif task_name == "rte":
79 | return {"acc": simple_accuracy(preds, labels)}
80 | elif task_name == "wnli":
81 | return {"acc": simple_accuracy(preds, labels)}
82 | elif task_name == "multilabel":
83 | return {"acc": simple_accuracy(preds, labels)}
84 | else:
85 | raise KeyError(task_name)
86 |
--------------------------------------------------------------------------------
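A quick illustration of the "multilabel" branch of `glue_compute_metrics` above, which reports plain element-wise accuracy. This assumes scikit-learn is installed (the metric functions are only defined when the import succeeds); the arrays are tiny made-up examples, whereas run_glue.py would pass multi-hot label matrices:

```python
import numpy as np

from metrics1 import glue_compute_metrics

preds = np.array([1, 0, 1, 1])   # hypothetical predictions
labels = np.array([1, 0, 0, 1])  # hypothetical gold labels

# simple_accuracy() computes (preds == labels).mean() -> 0.75
print(glue_compute_metrics("multilabel", preds, labels))  # {'acc': 0.75}
```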
/utils.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | import csv
18 | import sys
19 | import copy
20 | import json
21 |
22 | class InputExample(object):
23 | """
24 | A single training/test example for simple sequence classification.
25 |
26 | Args:
27 | guid: Unique id for the example.
28 | text_a: string. The untokenized text of the first sequence. For single
29 | sequence tasks, only this sequence must be specified.
30 | text_b: (Optional) string. The untokenized text of the second sequence.
31 | Only must be specified for sequence pair tasks.
32 | label: (Optional) string. The label of the example. This should be
33 | specified for train and dev examples, but not for test examples.
34 | """
35 | def __init__(self, guid, text_a, text_b=None, label=None):
36 | self.guid = guid
37 | self.text_a = text_a
38 | self.text_b = text_b
39 | self.label = label
40 |
45 | def __repr__(self):
46 | return str(self.to_json_string())
47 |
48 | def to_dict(self):
49 | """Serializes this instance to a Python dictionary."""
50 | output = copy.deepcopy(self.__dict__)
51 | return output
52 |
53 | def to_json_string(self):
54 | """Serializes this instance to a JSON string."""
55 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
56 |
57 |
58 | class InputFeatures(object):
59 | """
60 | A single set of features of data.
61 |
62 | Args:
63 | input_ids: Indices of input sequence tokens in the vocabulary.
64 | attention_mask: Mask to avoid performing attention on padding token indices.
65 | Mask values selected in ``[0, 1]``:
66 | Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
67 | token_type_ids: Segment token indices to indicate first and second portions of the inputs.
68 | label: Label corresponding to the input
69 | """
70 |
71 | def __init__(self, input_ids, attention_mask, token_type_ids, label):
72 | self.input_ids = input_ids
73 | self.attention_mask = attention_mask
74 | self.token_type_ids = token_type_ids
75 | self.label = label
76 |
77 | def __repr__(self):
78 | return str(self.to_json_string())
79 |
80 | def to_dict(self):
81 | """Serializes this instance to a Python dictionary."""
82 | output = copy.deepcopy(self.__dict__)
83 | return output
84 |
85 | def to_json_string(self):
86 | """Serializes this instance to a JSON string."""
87 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
88 |
89 |
90 | class DataProcessor(object):
91 | """Base class for data converters for sequence classification data sets."""
92 |
93 | def get_example_from_tensor_dict(self, tensor_dict):
94 | """Gets an example from a dict with tensorflow tensors
95 |
96 | Args:
97 | tensor_dict: Keys and values should match the corresponding Glue
98 | tensorflow_dataset examples.
99 | """
100 | raise NotImplementedError()
101 |
102 | def get_train_examples(self, data_dir):
103 | """Gets a collection of `InputExample`s for the train set."""
104 | raise NotImplementedError()
105 |
106 | def get_dev_examples(self, data_dir):
107 | """Gets a collection of `InputExample`s for the dev set."""
108 | raise NotImplementedError()
109 |
110 | def get_labels(self):
111 | """Gets the list of labels for this data set."""
112 | raise NotImplementedError()
113 |
114 | @classmethod
115 | def _read_tsv(cls, input_file, quotechar=None):
116 | """Reads a tab separated value file."""
117 | with open(input_file, "r", encoding="utf-8-sig") as f:
118 | reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
119 | lines = []
120 | for line in reader:
121 | if sys.version_info[0] == 2:
122 | line = list(unicode(cell, 'utf-8') for cell in line)
123 | lines.append(line)
124 | return lines
125 |
--------------------------------------------------------------------------------
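A small usage sketch for the containers defined in utils.py above; the guid, text, and semicolon-joined label string are invented for illustration (the exact label format is whatever run_glue.py's processor expects):

```python
from utils import InputExample

example = InputExample(
    guid="train-1",
    text_a="are there any tickets left for tonight",
    label="request_ticket;inform_starttime",  # hypothetical multi-label encoding
)

# __repr__ serializes the example to JSON via to_json_string()
print(example)
```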
/tokenization_roberta1.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes for RoBERTa."""
16 | from __future__ import (absolute_import, division, print_function,
17 | unicode_literals)
18 |
19 | import sys
20 | import json
21 | import logging
22 | import os
23 | import regex as re
24 | from io import open
25 |
26 | from tokenization_gpt21 import GPT2Tokenizer
27 |
28 | try:
29 | from functools import lru_cache
30 | except ImportError:
31 | # Just a dummy decorator to get the checks to run on python2
32 | # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
33 | def lru_cache():
34 | return lambda func: func
35 |
36 | logger = logging.getLogger(__name__)
37 |
38 | VOCAB_FILES_NAMES = {
39 | 'vocab_file': 'vocab.json',
40 | 'merges_file': 'merges.txt',
41 | }
42 |
43 | PRETRAINED_VOCAB_FILES_MAP = {
44 | 'vocab_file':
45 | {
46 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
47 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
48 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json",
49 | },
50 | 'merges_file':
51 | {
52 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
53 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
54 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt",
55 | },
56 | }
57 |
58 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
59 | 'roberta-base': 512,
60 | 'roberta-large': 512,
61 | 'roberta-large-mnli': 512,
62 | }
63 |
64 |
65 | class RobertaTokenizer(GPT2Tokenizer):
66 | """
67 | RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:
68 | - Byte-level Byte-Pair-Encoding
69 | - Requires a space to start the input string => the encoding methods should be called with the
70 | ``add_prefix_space`` flag set to ``True``.
71 | Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
72 | the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
73 | """
74 | vocab_files_names = VOCAB_FILES_NAMES
75 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
76 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
77 |
78 |     def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>",
79 |                  cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs):
80 | super(RobertaTokenizer, self).__init__(vocab_file=vocab_file, merges_file=merges_file, errors=errors,
81 | bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
82 | sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
83 | mask_token=mask_token, **kwargs)
84 | self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
85 | self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
86 |
87 | def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
88 | """
89 | Build model inputs from a sequence or a pair of sequence for sequence classification tasks
90 | by concatenating and adding special tokens.
91 | A RoBERTa sequence has the following format:
92 |         single sequence: <s> X </s>
93 |         pair of sequences: <s> A </s></s> B </s>
94 | """
95 | if token_ids_1 is None:
96 | return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
97 | cls = [self.cls_token_id]
98 | sep = [self.sep_token_id]
99 | return cls + token_ids_0 + sep + sep + token_ids_1 + sep
100 |
101 | def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
102 | """
103 | Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
104 | special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
105 |
106 | Args:
107 | token_ids_0: list of ids (must not contain special tokens)
108 | token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
109 | for sequence pairs
110 | already_has_special_tokens: (default False) Set to True if the token list is already formated with
111 | special tokens for the model
112 |
113 | Returns:
114 | A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
115 | """
116 | if already_has_special_tokens:
117 | if token_ids_1 is not None:
118 | raise ValueError("You should not supply a second sequence if the provided sequence of "
119 | "ids is already formated with special tokens for the model.")
120 | return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
121 |
122 | if token_ids_1 is None:
123 | return [1] + ([0] * len(token_ids_0)) + [1]
124 | return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
125 |
126 | def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
127 | """
128 | Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
129 | A RoBERTa sequence pair mask has the following format:
130 | 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
131 | | first sequence | second sequence
132 |
133 | if token_ids_1 is None, only returns the first portion of the mask (0's).
134 | """
135 | sep = [self.sep_token_id]
136 | cls = [self.cls_token_id]
137 |
138 | if token_ids_1 is None:
139 | return len(cls + token_ids_0 + sep) * [0]
140 | return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
141 |
--------------------------------------------------------------------------------
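The special-token layout described in the RobertaTokenizer docstrings above can be traced without instantiating a tokenizer; the sketch below mirrors `build_inputs_with_special_tokens` and `create_token_type_ids_from_sequences` for a sequence pair, using placeholder ids (0 for `<s>` and 2 for `</s>`, RoBERTa's usual values, assumed here):

```python
cls_id, sep_id = 0, 2        # assumed <s> / </s> ids
ids_a = [31414, 232]         # hypothetical token ids for sequence A
ids_b = [1437, 119]          # hypothetical token ids for sequence B

# build_inputs_with_special_tokens for a pair: <s> A </s> </s> B </s>
input_ids = [cls_id] + ids_a + [sep_id] + [sep_id] + ids_b + [sep_id]
print(input_ids)             # [0, 31414, 232, 2, 2, 1437, 119, 2]

# create_token_type_ids_from_sequences: 0s cover "<s> A </s> </s>", 1s cover "B </s>"
token_type_ids = [0] * (len(ids_a) + 3) + [1] * (len(ids_b) + 1)
print(token_type_ids)        # [0, 0, 0, 0, 0, 1, 1, 1]
```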
/configuration_bert1.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ BERT model configuration """
17 |
18 | from __future__ import absolute_import, division, print_function, unicode_literals
19 |
20 | import json
21 | import logging
22 | import sys
23 | from io import open
24 |
25 | from configuration_utils1 import PretrainedConfig
26 |
27 | logger = logging.getLogger(__name__)
28 |
29 | BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
30 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json",
31 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json",
32 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json",
33 | 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json",
34 | 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json",
35 | 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json",
36 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json",
37 | 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json",
38 | 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json",
39 | 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json",
40 | 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json",
41 | 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json",
42 | 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
43 | 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
44 | 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
45 | }
46 |
47 |
48 | class BertConfig(PretrainedConfig):
49 | r"""
50 | :class:`~transformers.BertConfig` is the configuration class to store the configuration of a
51 | `BertModel`.
52 |
53 |
54 | Arguments:
55 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
56 | hidden_size: Size of the encoder layers and the pooler layer.
57 | num_hidden_layers: Number of hidden layers in the Transformer encoder.
58 | num_attention_heads: Number of attention heads for each attention layer in
59 | the Transformer encoder.
60 | intermediate_size: The size of the "intermediate" (i.e., feed-forward)
61 | layer in the Transformer encoder.
62 | hidden_act: The non-linear activation function (function or string) in the
63 | encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
64 |         hidden_dropout_prob: The dropout probability for all fully connected
65 | layers in the embeddings, encoder, and pooler.
66 | attention_probs_dropout_prob: The dropout ratio for the attention
67 | probabilities.
68 | max_position_embeddings: The maximum sequence length that this model might
69 | ever be used with. Typically set this to something large just in case
70 | (e.g., 512 or 1024 or 2048).
71 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into
72 | `BertModel`.
73 |         initializer_range: The stddev of the truncated_normal_initializer for
74 | initializing all weight matrices.
75 | layer_norm_eps: The epsilon used by LayerNorm.
76 | """
77 | pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
78 |
79 | def __init__(self,
80 | vocab_size_or_config_json_file=30522,
81 | hidden_size=768,
82 | num_hidden_layers=12,
83 | num_attention_heads=12,
84 | intermediate_size=3072,
85 | hidden_act="gelu",
86 | hidden_dropout_prob=0.1,
87 | attention_probs_dropout_prob=0.1,
88 | max_position_embeddings=512,
89 | type_vocab_size=2,
90 | initializer_range=0.02,
91 | layer_norm_eps=1e-12,
92 | **kwargs):
93 | super(BertConfig, self).__init__(**kwargs)
94 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
95 | and isinstance(vocab_size_or_config_json_file, unicode)):
96 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
97 | json_config = json.loads(reader.read())
98 | for key, value in json_config.items():
99 | self.__dict__[key] = value
100 | elif isinstance(vocab_size_or_config_json_file, int):
101 | self.vocab_size = vocab_size_or_config_json_file
102 | self.hidden_size = hidden_size
103 | self.num_hidden_layers = num_hidden_layers
104 | self.num_attention_heads = num_attention_heads
105 | self.hidden_act = hidden_act
106 | self.intermediate_size = intermediate_size
107 | self.hidden_dropout_prob = hidden_dropout_prob
108 | self.attention_probs_dropout_prob = attention_probs_dropout_prob
109 | self.max_position_embeddings = max_position_embeddings
110 | self.type_vocab_size = type_vocab_size
111 | self.initializer_range = initializer_range
112 | self.layer_norm_eps = layer_norm_eps
113 | else:
114 | raise ValueError("First argument must be either a vocabulary size (int)"
115 | " or the path to a pretrained model config file (str)")
116 |
--------------------------------------------------------------------------------
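For reference, BertConfig above can be built either from an integer vocabulary size plus keyword overrides or from a JSON file; a minimal sketch of the first form (the num_labels override is handled by the PretrainedConfig base class):

```python
from configuration_bert1 import BertConfig

# bert-base style defaults, overriding only the classifier head size.
config = BertConfig(vocab_size_or_config_json_file=30522, num_labels=16)
print(config.hidden_size, config.num_hidden_layers, config.num_labels)  # 768 12 16
```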
/optimization1.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """PyTorch optimization for BERT model."""
16 |
17 | import logging
18 | import math
19 |
20 | import torch
21 | from torch.optim import Optimizer
22 | from torch.optim.lr_scheduler import LambdaLR
23 |
24 | logger = logging.getLogger(__name__)
25 |
26 | class ConstantLRSchedule(LambdaLR):
27 | """ Constant learning rate schedule.
28 | """
29 | def __init__(self, optimizer, last_epoch=-1):
30 | super(ConstantLRSchedule, self).__init__(optimizer, lambda _: 1.0, last_epoch=last_epoch)
31 |
32 |
33 | class WarmupConstantSchedule(LambdaLR):
34 | """ Linear warmup and then constant.
35 | Linearly increases learning rate schedule from 0 to 1 over `warmup_steps` training steps.
36 | Keeps learning rate schedule equal to 1. after warmup_steps.
37 | """
38 | def __init__(self, optimizer, warmup_steps, last_epoch=-1):
39 | self.warmup_steps = warmup_steps
40 | super(WarmupConstantSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
41 |
42 | def lr_lambda(self, step):
43 | if step < self.warmup_steps:
44 | return float(step) / float(max(1.0, self.warmup_steps))
45 | return 1.
46 |
47 |
48 | class WarmupLinearSchedule(LambdaLR):
49 | """ Linear warmup and then linear decay.
50 | Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
51 | Linearly decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps.
52 | """
53 | def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1):
54 | self.warmup_steps = warmup_steps
55 | self.t_total = t_total
56 | super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
57 |
58 | def lr_lambda(self, step):
59 | if step < self.warmup_steps:
60 | return float(step) / float(max(1, self.warmup_steps))
61 | return max(0.0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps)))
62 |
63 |
64 | class WarmupCosineSchedule(LambdaLR):
65 | """ Linear warmup and then cosine decay.
66 | Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
67 | Decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps following a cosine curve.
68 | If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
69 | """
70 | def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1):
71 | self.warmup_steps = warmup_steps
72 | self.t_total = t_total
73 | self.cycles = cycles
74 | super(WarmupCosineSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
75 |
76 | def lr_lambda(self, step):
77 | if step < self.warmup_steps:
78 | return float(step) / float(max(1.0, self.warmup_steps))
79 | # progress after warmup
80 | progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps))
81 | return max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress)))
82 |
83 |
84 | class WarmupCosineWithHardRestartsSchedule(LambdaLR):
85 | """ Linear warmup and then cosine cycles with hard restarts.
86 | Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
87 | If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying
88 | learning rate (with hard restarts).
89 | """
90 | def __init__(self, optimizer, warmup_steps, t_total, cycles=1., last_epoch=-1):
91 | self.warmup_steps = warmup_steps
92 | self.t_total = t_total
93 | self.cycles = cycles
94 | super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
95 |
96 | def lr_lambda(self, step):
97 | if step < self.warmup_steps:
98 | return float(step) / float(max(1, self.warmup_steps))
99 | # progress after warmup
100 | progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps))
101 | if progress >= 1.0:
102 | return 0.0
103 | return max(0.0, 0.5 * (1. + math.cos(math.pi * ((float(self.cycles) * progress) % 1.0))))
104 |
105 |
106 |
107 | class AdamW(Optimizer):
108 | """ Implements Adam algorithm with weight decay fix.
109 |
110 | Parameters:
111 | lr (float): learning rate. Default 1e-3.
112 | betas (tuple of 2 floats): Adams beta parameters (b1, b2). Default: (0.9, 0.999)
113 | eps (float): Adams epsilon. Default: 1e-6
114 | weight_decay (float): Weight decay. Default: 0.0
115 | correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True.
116 | """
117 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True):
118 | if lr < 0.0:
119 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
120 | if not 0.0 <= betas[0] < 1.0:
121 | raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0]))
122 | if not 0.0 <= betas[1] < 1.0:
123 | raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1]))
124 | if not 0.0 <= eps:
125 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps))
126 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
127 | correct_bias=correct_bias)
128 | super(AdamW, self).__init__(params, defaults)
129 |
130 | def step(self, closure=None):
131 | """Performs a single optimization step.
132 |
133 | Arguments:
134 | closure (callable, optional): A closure that reevaluates the model
135 | and returns the loss.
136 | """
137 | loss = None
138 | if closure is not None:
139 | loss = closure()
140 |
141 | for group in self.param_groups:
142 | for p in group['params']:
143 | if p.grad is None:
144 | continue
145 | grad = p.grad.data
146 | if grad.is_sparse:
147 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
148 |
149 | state = self.state[p]
150 |
151 | # State initialization
152 | if len(state) == 0:
153 | state['step'] = 0
154 | # Exponential moving average of gradient values
155 | state['exp_avg'] = torch.zeros_like(p.data)
156 | # Exponential moving average of squared gradient values
157 | state['exp_avg_sq'] = torch.zeros_like(p.data)
158 |
159 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
160 | beta1, beta2 = group['betas']
161 |
162 | state['step'] += 1
163 |
164 | # Decay the first and second moment running average coefficient
165 | # In-place operations to update the averages at the same time
166 | exp_avg.mul_(beta1).add_(1.0 - beta1, grad)
167 | exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad)
168 | denom = exp_avg_sq.sqrt().add_(group['eps'])
169 |
170 | step_size = group['lr']
171 | if group['correct_bias']: # No bias correction for Bert
172 | bias_correction1 = 1.0 - beta1 ** state['step']
173 | bias_correction2 = 1.0 - beta2 ** state['step']
174 | step_size = step_size * math.sqrt(bias_correction2) / bias_correction1
175 |
176 | p.data.addcdiv_(-step_size, exp_avg, denom)
177 |
178 | # Just adding the square of the weights to the loss function is *not*
179 | # the correct way of using L2 regularization/weight decay with Adam,
180 | # since that will interact with the m and v parameters in strange ways.
181 | #
182 | # Instead we want to decay the weights in a manner that doesn't interact
183 | # with the m/v parameters. This is equivalent to adding the square
184 | # of the weights to the loss with plain (non-momentum) SGD.
185 | # Add weight decay at the end (fixed version)
186 | if group['weight_decay'] > 0.0:
187 | p.data.add_(-group['lr'] * group['weight_decay'], p.data)
188 |
189 | return loss
190 |
--------------------------------------------------------------------------------
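A minimal sketch of how the AdamW optimizer and the warmup schedules above are typically wired together; the linear layer and the hyper-parameters are illustrative placeholders, not the values used in run_glue.py:

```python
import torch

from optimization1 import AdamW, WarmupLinearSchedule

model = torch.nn.Linear(768, 16)      # stand-in for a real classification head

t_total = 1000                        # assumed total number of training steps
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=100, t_total=t_total)

for step in range(t_total):
    # ... forward pass and loss.backward() would go here ...
    optimizer.step()                  # update parameters
    scheduler.step()                  # linear warmup, then linear decay to 0
    optimizer.zero_grad()
```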
/tokenization_gpt21.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes for OpenAI GPT."""
16 | from __future__ import (absolute_import, division, print_function,
17 | unicode_literals)
18 |
19 | import sys
20 | import json
21 | import logging
22 | import os
23 | import regex as re
24 | from io import open
25 |
26 | try:
27 | from functools import lru_cache
28 | except ImportError:
29 | # Just a dummy decorator to get the checks to run on python2
30 | # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
31 | def lru_cache():
32 | return lambda func: func
33 |
34 | from tokenization_utils1 import PreTrainedTokenizer
35 |
36 | logger = logging.getLogger(__name__)
37 |
38 | VOCAB_FILES_NAMES = {
39 | 'vocab_file': 'vocab.json',
40 | 'merges_file': 'merges.txt',
41 | }
42 |
43 | PRETRAINED_VOCAB_FILES_MAP = {
44 | 'vocab_file':
45 | {
46 | 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
47 | 'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
48 | 'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
49 | 'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json",
50 | },
51 | 'merges_file':
52 | {
53 | 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
54 | 'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
55 | 'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
56 | 'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt",
57 | },
58 | }
59 |
60 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
61 | 'gpt2': 1024,
62 | 'gpt2-medium': 1024,
63 | 'gpt2-large': 1024,
64 | 'distilgpt2': 1024,
65 | }
66 |
67 |
68 | @lru_cache()
69 | def bytes_to_unicode():
70 | """
71 |     Returns a dict mapping utf-8 bytes to unicode strings.
72 |     We specifically avoid mapping to whitespace/control characters the bpe code barfs on.
73 |
74 | The reversible bpe codes work on unicode strings.
75 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
76 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
77 |     This is a significant percentage of your normal, say, 32K bpe vocab.
78 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
79 | """
80 | _chr = unichr if sys.version_info[0] == 2 else chr
81 | bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
82 | cs = bs[:]
83 | n = 0
84 | for b in range(2 ** 8):
85 | if b not in bs:
86 | bs.append(b)
87 | cs.append(2 ** 8 + n)
88 | n += 1
89 | cs = [_chr(n) for n in cs]
90 | return dict(zip(bs, cs))
91 |
92 |
93 | def get_pairs(word):
94 | """Return set of symbol pairs in a word.
95 |
96 | Word is represented as tuple of symbols (symbols being variable-length strings).
97 | """
98 | pairs = set()
99 | prev_char = word[0]
100 | for char in word[1:]:
101 | pairs.add((prev_char, char))
102 | prev_char = char
103 | return pairs
104 |
105 |
106 | class GPT2Tokenizer(PreTrainedTokenizer):
107 | """
108 | GPT-2 BPE tokenizer. Peculiarities:
109 | - Byte-level Byte-Pair-Encoding
110 | - Requires a space to start the input string => the encoding methods should be called with the
111 | ``add_prefix_space`` flag set to ``True``.
112 | Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
113 | the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
114 | """
115 | vocab_files_names = VOCAB_FILES_NAMES
116 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
117 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
118 |
119 | def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>",
120 | bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
121 | super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
122 | self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
123 | self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
124 |
125 | self.encoder = json.load(open(vocab_file, encoding="utf-8"))
126 | self.decoder = {v: k for k, v in self.encoder.items()}
127 | self.errors = errors # how to handle errors in decoding
128 | self.byte_encoder = bytes_to_unicode()
129 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
130 | bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
131 | bpe_merges = [tuple(merge.split()) for merge in bpe_data]
132 | self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
133 | self.cache = {}
134 |
135 |         # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
136 | self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
137 |
138 | @property
139 | def vocab_size(self):
140 | return len(self.encoder)
141 |
142 | def bpe(self, token):
143 | if token in self.cache:
144 | return self.cache[token]
145 | word = tuple(token)
146 | pairs = get_pairs(word)
147 |
148 | if not pairs:
149 | return token
150 |
151 | while True:
152 | bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
153 | if bigram not in self.bpe_ranks:
154 | break
155 | first, second = bigram
156 | new_word = []
157 | i = 0
158 | while i < len(word):
159 | try:
160 | j = word.index(first, i)
161 | new_word.extend(word[i:j])
162 | i = j
163 | except:
164 | new_word.extend(word[i:])
165 | break
166 |
167 | if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
168 | new_word.append(first + second)
169 | i += 2
170 | else:
171 | new_word.append(word[i])
172 | i += 1
173 | new_word = tuple(new_word)
174 | word = new_word
175 | if len(word) == 1:
176 | break
177 | else:
178 | pairs = get_pairs(word)
179 | word = ' '.join(word)
180 | self.cache[token] = word
181 | return word
182 |
183 | def _tokenize(self, text, add_prefix_space=False):
184 | """ Tokenize a string.
185 | Args:
186 | - add_prefix_space (boolean, default False):
187 |                 Begin the sentence with at least one space to get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
188 | """
189 | if add_prefix_space:
190 | text = ' ' + text
191 |
192 | bpe_tokens = []
193 | for token in re.findall(self.pat, text):
194 | if sys.version_info[0] == 2:
195 | token = ''.join(self.byte_encoder[ord(b)] for b in
196 |                                 token) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
197 | else:
198 | token = ''.join(self.byte_encoder[b] for b in token.encode(
199 |                     'utf-8'))  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
200 | bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
201 | return bpe_tokens
202 |
203 | def _convert_token_to_id(self, token):
204 | """ Converts a token (str/unicode) in an id using the vocab. """
205 | return self.encoder.get(token, self.encoder.get(self.unk_token))
206 |
207 | def _convert_id_to_token(self, index):
208 | """Converts an index (integer) in a token (string/unicode) using the vocab."""
209 | return self.decoder.get(index)
210 |
211 | def convert_tokens_to_string(self, tokens):
212 | """ Converts a sequence of tokens (string) in a single string. """
213 | text = ''.join(tokens)
214 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
215 | return text
216 |
217 | def save_vocabulary(self, save_directory):
218 | """Save the tokenizer vocabulary and merge files to a directory."""
219 | if not os.path.isdir(save_directory):
220 | logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
221 | return
222 | vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
223 | merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
224 |
225 | with open(vocab_file, 'w', encoding='utf-8') as f:
226 | f.write(json.dumps(self.encoder, ensure_ascii=False))
227 |
228 | index = 0
229 | with open(merge_file, "w", encoding="utf-8") as writer:
230 | writer.write(u'#version: 0.2\n')
231 | for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
232 | if index != token_index:
233 | logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
234 | " Please check that the tokenizer is not corrupted!".format(merge_file))
235 | index = token_index
236 | writer.write(' '.join(bpe_tokens) + u'\n')
237 | index += 1
238 |
239 | return vocab_file, merge_file
--------------------------------------------------------------------------------
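The byte-level trick explained in the bytes_to_unicode docstring above is easy to poke at directly (this assumes the module's own imports, regex and tokenization_utils1, resolve on your machine):

```python
from tokenization_gpt21 import bytes_to_unicode, get_pairs

byte_encoder = bytes_to_unicode()
# Every one of the 256 byte values maps to a printable unicode character,
# so BPE never has to operate on raw whitespace/control bytes.
assert len(byte_encoder) == 256
print(byte_encoder[ord(" ")])  # 'Ġ' -- the leading-space marker seen in GPT-2 vocabs

# get_pairs() lists adjacent symbol pairs, the candidates for the next BPE merge.
print(get_pairs(("h", "e", "l", "l", "o")))  # set of pairs, order may vary
```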
/configuration_utils1.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ Configuration base class and utilities."""
17 |
18 | from __future__ import (absolute_import, division, print_function,
19 | unicode_literals)
20 |
21 | import copy
22 | import json
23 | import logging
24 | import os
25 | from io import open
26 |
27 | from file_utils1 import cached_path, CONFIG_NAME
28 |
29 | logger = logging.getLogger(__name__)
30 |
31 | class PretrainedConfig(object):
32 | r""" Base class for all configuration classes.
33 | Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations.
34 |
35 | Note:
36 | A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights.
37 | It only affects the model's configuration.
38 |
39 | Class attributes (overridden by derived classes):
40 |         - ``pretrained_config_archive_map``: a python ``dict`` with `short-cut-names` (string) as keys and `url` (string) of associated pretrained model configurations as values.
41 |
42 | Parameters:
43 | ``finetuning_task``: string, default `None`. Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint.
44 | ``num_labels``: integer, default `2`. Number of classes to use when the model is a classification model (sequences/tokens)
45 |         ``output_attentions``: boolean, default `False`. Should the model return attention weights.
46 |         ``output_hidden_states``: boolean, default `False`. Should the model return all hidden states.
47 |         ``torchscript``: boolean, default `False`. Is the model used with TorchScript.
48 | """
49 | pretrained_config_archive_map = {}
50 |
51 | def __init__(self, **kwargs):
52 | self.finetuning_task = kwargs.pop('finetuning_task', None)
53 | self.num_labels = kwargs.pop('num_labels', 2)
54 | self.output_attentions = kwargs.pop('output_attentions', False)
55 | self.output_hidden_states = kwargs.pop('output_hidden_states', False)
56 | self.output_past = kwargs.pop('output_past', True) # Not used by all models
57 | self.torchscript = kwargs.pop('torchscript', False) # Only used by PyTorch models
58 | self.use_bfloat16 = kwargs.pop('use_bfloat16', False)
59 | self.pruned_heads = kwargs.pop('pruned_heads', {})
60 |
61 | def save_pretrained(self, save_directory):
62 | """ Save a configuration object to the directory `save_directory`, so that it
63 | can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
64 | """
65 | assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
66 |
67 | # If we save using the predefined names, we can load using `from_pretrained`
68 | output_config_file = os.path.join(save_directory, CONFIG_NAME)
69 |
70 | self.to_json_file(output_config_file)
71 | logger.info("Configuration saved in {}".format(output_config_file))
72 |
73 | @classmethod
74 | def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
75 | r""" Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
76 |
77 | Parameters:
78 | pretrained_model_name_or_path: either:
79 |
80 | - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
81 | - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
82 | - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
83 |
84 | cache_dir: (`optional`) string:
85 | Path to a directory in which a downloaded pre-trained model
86 | configuration should be cached if the standard cache should not be used.
87 |
88 | kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading.
89 |
90 | - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
91 | - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
92 |
93 | force_download: (`optional`) boolean, default False:
94 |                 Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
95 |
96 | proxies: (`optional`) dict, default None:
97 | A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
98 | The proxies are used on each request.
99 |
100 | return_unused_kwargs: (`optional`) bool:
101 |
102 | - If False, then this function returns just the final configuration object.
103 |                 - If True, then this function returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e. the part of kwargs which has not been used to update `config` and is otherwise ignored.
104 |
105 | Examples::
106 |
107 | # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a
108 | # derived class: BertConfig
109 | config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
110 | config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
111 | config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
112 |             config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False)
113 |             assert config.output_attentions == True
114 |             config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True,
115 |                                                                foo=False, return_unused_kwargs=True)
116 |             assert config.output_attentions == True
117 | assert unused_kwargs == {'foo': False}
118 |
119 | """
120 | cache_dir = kwargs.pop('cache_dir', None)
121 | force_download = kwargs.pop('force_download', False)
122 | proxies = kwargs.pop('proxies', None)
123 | return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
124 |
125 | if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
126 | config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
127 | elif os.path.isdir(pretrained_model_name_or_path):
128 | config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
129 | else:
130 | config_file = pretrained_model_name_or_path
131 | # redirect to the cache, if necessary
132 | try:
133 | resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
134 | except EnvironmentError:
135 | if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
136 | msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
137 | config_file)
138 | else:
139 | msg = "Model name '{}' was not found in model name list ({}). " \
140 | "We assumed '{}' was a path or url to a configuration file named {} or " \
141 | "a directory containing such a file but couldn't find any such file at this path or url.".format(
142 | pretrained_model_name_or_path,
143 | ', '.join(cls.pretrained_config_archive_map.keys()),
144 | config_file, CONFIG_NAME)
145 | raise EnvironmentError(msg)
146 |
147 | if resolved_config_file == config_file:
148 | logger.info("loading configuration file {}".format(config_file))
149 | else:
150 | logger.info("loading configuration file {} from cache at {}".format(
151 | config_file, resolved_config_file))
152 |
153 | # Load config
154 | config = cls.from_json_file(resolved_config_file)
155 |
156 | if hasattr(config, 'pruned_heads'):
157 | config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items())
158 |
159 | # Update config with kwargs if needed
160 | to_remove = []
161 | for key, value in kwargs.items():
162 | if hasattr(config, key):
163 | setattr(config, key, value)
164 | to_remove.append(key)
165 | for key in to_remove:
166 | kwargs.pop(key, None)
167 |
168 | logger.info("Model config %s", str(config))
169 | if return_unused_kwargs:
170 | return config, kwargs
171 | else:
172 | return config
173 |
174 | @classmethod
175 | def from_dict(cls, json_object):
176 | """Constructs a `Config` from a Python dictionary of parameters."""
177 | config = cls(vocab_size_or_config_json_file=-1)
178 | for key, value in json_object.items():
179 | setattr(config, key, value)
180 | return config
181 |
182 | @classmethod
183 | def from_json_file(cls, json_file):
184 | """Constructs a `BertConfig` from a json file of parameters."""
185 | with open(json_file, "r", encoding='utf-8') as reader:
186 | text = reader.read()
187 | return cls.from_dict(json.loads(text))
188 |
189 | def __eq__(self, other):
190 | return self.__dict__ == other.__dict__
191 |
192 | def __repr__(self):
193 | return str(self.to_json_string())
194 |
195 | def to_dict(self):
196 | """Serializes this instance to a Python dictionary."""
197 | output = copy.deepcopy(self.__dict__)
198 | return output
199 |
200 | def to_json_string(self):
201 | """Serializes this instance to a JSON string."""
202 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
203 |
204 | def to_json_file(self, json_file_path):
205 | """ Save this instance to a json file."""
206 | with open(json_file_path, "w", encoding='utf-8') as writer:
207 | writer.write(self.to_json_string())
208 |
--------------------------------------------------------------------------------
/file_utils1.py:
--------------------------------------------------------------------------------
1 | """
2 | Utilities for working with the local dataset cache.
3 | This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
4 | Copyright by the AllenNLP authors.
5 | """
6 | from __future__ import (absolute_import, division, print_function, unicode_literals)
7 |
8 | import sys
9 | import json
10 | import logging
11 | import os
12 | import six
13 | import shutil
14 | import tempfile
15 | import fnmatch
16 | from functools import wraps
17 | from hashlib import sha256
18 | from io import open
19 |
20 | import boto3
21 | from botocore.config import Config
22 | from botocore.exceptions import ClientError
23 | import requests
24 | from tqdm import tqdm
25 |
26 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name
27 |
28 | try:
29 | import tensorflow as tf
30 | assert hasattr(tf, '__version__') and int(tf.__version__[0]) >= 2
31 | _tf_available = True # pylint: disable=invalid-name
32 | logger.info("TensorFlow version {} available.".format(tf.__version__))
33 | except (ImportError, AssertionError):
34 | _tf_available = False # pylint: disable=invalid-name
35 |
36 | try:
37 | import torch
38 | _torch_available = True # pylint: disable=invalid-name
39 | logger.info("PyTorch version {} available.".format(torch.__version__))
40 | except ImportError:
41 | _torch_available = False # pylint: disable=invalid-name
42 |
43 |
44 | try:
45 | from torch.hub import _get_torch_home
46 | torch_cache_home = _get_torch_home()
47 | except ImportError:
48 | torch_cache_home = os.path.expanduser(
49 | os.getenv('TORCH_HOME', os.path.join(
50 | os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch')))
51 | default_cache_path = os.path.join(torch_cache_home, 'transformers')
52 |
53 | try:
54 | from urllib.parse import urlparse
55 | except ImportError:
56 | from urlparse import urlparse
57 |
58 | try:
59 | from pathlib import Path
60 | PYTORCH_PRETRAINED_BERT_CACHE = Path(
61 | os.getenv('PYTORCH_TRANSFORMERS_CACHE', os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path)))
62 | except (AttributeError, ImportError):
63 | PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_TRANSFORMERS_CACHE',
64 | os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
65 | default_cache_path))
66 |
67 | PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility
68 | TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility
69 |
70 | WEIGHTS_NAME = "pytorch_model.bin"
71 | TF2_WEIGHTS_NAME = 'tf_model.h5'
72 | TF_WEIGHTS_NAME = 'model.ckpt'
73 | CONFIG_NAME = "config.json"
74 |
75 | def is_torch_available():
76 | return _torch_available
77 |
78 | def is_tf_available():
79 | return _tf_available
80 |
81 | if not six.PY2:
82 | def add_start_docstrings(*docstr):
83 | def docstring_decorator(fn):
84 | fn.__doc__ = ''.join(docstr) + fn.__doc__
85 | return fn
86 | return docstring_decorator
87 |
88 | def add_end_docstrings(*docstr):
89 | def docstring_decorator(fn):
90 | fn.__doc__ = fn.__doc__ + ''.join(docstr)
91 | return fn
92 | return docstring_decorator
93 | else:
94 | # Not possible to update class docstrings on python2
95 | def add_start_docstrings(*docstr):
96 | def docstring_decorator(fn):
97 | return fn
98 | return docstring_decorator
99 |
100 | def add_end_docstrings(*docstr):
101 | def docstring_decorator(fn):
102 | return fn
103 | return docstring_decorator
104 |
105 | def url_to_filename(url, etag=None):
106 | """
107 | Convert `url` into a hashed filename in a repeatable way.
108 | If `etag` is specified, append its hash to the url's, delimited
109 | by a period.
110 |     If the url ends with .h5 (Keras HDF5 weights), '.h5' is appended to the name
111 |     so that TF 2.0 can identify it as an HDF5 file
112 | (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)
113 | """
114 | url_bytes = url.encode('utf-8')
115 | url_hash = sha256(url_bytes)
116 | filename = url_hash.hexdigest()
117 |
118 | if etag:
119 | etag_bytes = etag.encode('utf-8')
120 | etag_hash = sha256(etag_bytes)
121 | filename += '.' + etag_hash.hexdigest()
122 |
123 | if url.endswith('.h5'):
124 | filename += '.h5'
125 |
126 | return filename
127 |
128 |
129 | def filename_to_url(filename, cache_dir=None):
130 | """
131 | Return the url and etag (which may be ``None``) stored for `filename`.
132 | Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
133 | """
134 | if cache_dir is None:
135 | cache_dir = TRANSFORMERS_CACHE
136 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
137 | cache_dir = str(cache_dir)
138 |
139 | cache_path = os.path.join(cache_dir, filename)
140 | if not os.path.exists(cache_path):
141 | raise EnvironmentError("file {} not found".format(cache_path))
142 |
143 | meta_path = cache_path + '.json'
144 | if not os.path.exists(meta_path):
145 | raise EnvironmentError("file {} not found".format(meta_path))
146 |
147 | with open(meta_path, encoding="utf-8") as meta_file:
148 | metadata = json.load(meta_file)
149 | url = metadata['url']
150 | etag = metadata['etag']
151 |
152 | return url, etag
153 |
154 |
155 | def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None):
156 | """
157 | Given something that might be a URL (or might be a local path),
158 | determine which. If it's a URL, download the file and cache it, and
159 | return the path to the cached file. If it's already a local path,
160 | make sure the file exists and then return the path.
161 | Args:
162 | cache_dir: specify a cache directory to save the file to (overrides the default cache dir).
163 | force_download: if True, re-download the file even if it's already cached in the cache dir.
164 | """
165 | if cache_dir is None:
166 | cache_dir = TRANSFORMERS_CACHE
167 | if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
168 | url_or_filename = str(url_or_filename)
169 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
170 | cache_dir = str(cache_dir)
171 |
172 | parsed = urlparse(url_or_filename)
173 |
174 | if parsed.scheme in ('http', 'https', 's3'):
175 | # URL, so get it from the cache (downloading if necessary)
176 | return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
177 | elif os.path.exists(url_or_filename):
178 | # File, and it exists.
179 | return url_or_filename
180 | elif parsed.scheme == '':
181 | # File, but it doesn't exist.
182 | raise EnvironmentError("file {} not found".format(url_or_filename))
183 | else:
184 | # Something unknown
185 | raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
186 |
187 |
188 | def split_s3_path(url):
189 | """Split a full s3 path into the bucket name and path."""
190 | parsed = urlparse(url)
191 | if not parsed.netloc or not parsed.path:
192 | raise ValueError("bad s3 path {}".format(url))
193 | bucket_name = parsed.netloc
194 | s3_path = parsed.path
195 | # Remove '/' at beginning of path.
196 | if s3_path.startswith("/"):
197 | s3_path = s3_path[1:]
198 | return bucket_name, s3_path
199 |
200 |
201 | def s3_request(func):
202 | """
203 | Wrapper function for s3 requests in order to create more helpful error
204 | messages.
205 | """
206 |
207 | @wraps(func)
208 | def wrapper(url, *args, **kwargs):
209 | try:
210 | return func(url, *args, **kwargs)
211 | except ClientError as exc:
212 | if int(exc.response["Error"]["Code"]) == 404:
213 | raise EnvironmentError("file {} not found".format(url))
214 | else:
215 | raise
216 |
217 | return wrapper
218 |
219 |
220 | @s3_request
221 | def s3_etag(url, proxies=None):
222 | """Check ETag on S3 object."""
223 | s3_resource = boto3.resource("s3", config=Config(proxies=proxies))
224 | bucket_name, s3_path = split_s3_path(url)
225 | s3_object = s3_resource.Object(bucket_name, s3_path)
226 | return s3_object.e_tag
227 |
228 |
229 | @s3_request
230 | def s3_get(url, temp_file, proxies=None):
231 | """Pull a file directly from S3."""
232 | s3_resource = boto3.resource("s3", config=Config(proxies=proxies))
233 | bucket_name, s3_path = split_s3_path(url)
234 | s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
235 |
236 |
237 | def http_get(url, temp_file, proxies=None):
238 | req = requests.get(url, stream=True, proxies=proxies)
239 | content_length = req.headers.get('Content-Length')
240 | total = int(content_length) if content_length is not None else None
241 | progress = tqdm(unit="B", total=total)
242 | for chunk in req.iter_content(chunk_size=1024):
243 | if chunk: # filter out keep-alive new chunks
244 | progress.update(len(chunk))
245 | temp_file.write(chunk)
246 | progress.close()
247 |
248 |
249 | def get_from_cache(url, cache_dir=None, force_download=False, proxies=None):
250 | """
251 | Given a URL, look for the corresponding dataset in the local cache.
252 | If it's not there, download it. Then return the path to the cached file.
253 | """
254 | if cache_dir is None:
255 | cache_dir = TRANSFORMERS_CACHE
256 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
257 | cache_dir = str(cache_dir)
258 | if sys.version_info[0] == 2 and not isinstance(cache_dir, str):
259 | cache_dir = str(cache_dir)
260 |
261 | if not os.path.exists(cache_dir):
262 | os.makedirs(cache_dir)
263 |
264 | # Get eTag to add to filename, if it exists.
265 | if url.startswith("s3://"):
266 | etag = s3_etag(url, proxies=proxies)
267 | else:
268 | try:
269 | response = requests.head(url, allow_redirects=True, proxies=proxies)
270 | if response.status_code != 200:
271 | etag = None
272 | else:
273 | etag = response.headers.get("ETag")
274 | except EnvironmentError:
275 | etag = None
276 |
277 | if sys.version_info[0] == 2 and etag is not None:
278 | etag = etag.decode('utf-8')
279 | filename = url_to_filename(url, etag)
280 |
281 | # get cache path to put the file
282 | cache_path = os.path.join(cache_dir, filename)
283 |
284 | # If we don't have a connection (etag is None) and can't identify the file
285 | # try to get the last downloaded one
286 | if not os.path.exists(cache_path) and etag is None:
287 | matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*')
288 | matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files))
289 | if matching_files:
290 | cache_path = os.path.join(cache_dir, matching_files[-1])
291 |
292 | if not os.path.exists(cache_path) or force_download:
293 | # Download to temporary file, then copy to cache dir once finished.
294 | # Otherwise you get corrupt cache entries if the download gets interrupted.
295 | with tempfile.NamedTemporaryFile() as temp_file:
296 | logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)
297 |
298 | # GET file object
299 | if url.startswith("s3://"):
300 | s3_get(url, temp_file, proxies=proxies)
301 | else:
302 | http_get(url, temp_file, proxies=proxies)
303 |
304 | # we are copying the file before closing it, so flush to avoid truncation
305 | temp_file.flush()
306 | # shutil.copyfileobj() starts at the current position, so go to the start
307 | temp_file.seek(0)
308 |
309 | logger.info("copying %s to cache at %s", temp_file.name, cache_path)
310 | with open(cache_path, 'wb') as cache_file:
311 | shutil.copyfileobj(temp_file, cache_file)
312 |
313 | logger.info("creating metadata file for %s", cache_path)
314 | meta = {'url': url, 'etag': etag}
315 | meta_path = cache_path + '.json'
316 | with open(meta_path, 'w') as meta_file:
317 | output_string = json.dumps(meta)
318 | if sys.version_info[0] == 2 and isinstance(output_string, str):
319 | output_string = unicode(output_string, 'utf-8') # The beauty of python 2
320 | meta_file.write(output_string)
321 |
322 | logger.info("removing temp file %s", temp_file.name)
323 |
324 | return cache_path
325 |
--------------------------------------------------------------------------------
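
A note on how the cache above fits together: `cached_path` hands a URL to `get_from_cache`, which hashes the URL and the server's ETag into a filename via `url_to_filename`, copies the download into the cache directory, and writes a small `.json` sidecar recording the source; `filename_to_url` reverses the lookup. The sketch below is illustrative only (it is not part of the repository), assumes network access, and reuses a vocab URL that appears later in this dump.

# Illustrative sketch of the file_utils1.py cache (not repository code).
import os
import json
from file_utils1 import cached_path, filename_to_url

url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt"

# The first call downloads to a temporary file and copies it into the cache;
# later calls return the already cached copy.
local_path = cached_path(url)
print(local_path)  # <cache_dir>/<sha256(url)>.<sha256(etag)>

# Every cached blob gets a .json sidecar with its origin and ETag.
with open(local_path + ".json", encoding="utf-8") as meta_file:
    print(json.load(meta_file))  # {"url": "...", "etag": "..."}

# filename_to_url recovers (url, etag) from the hashed filename.
print(filename_to_url(os.path.basename(local_path)))
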
/glue1.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ GLUE processors and helpers """
17 |
18 | import logging
19 | import os
20 | import numpy as np
21 | from utils import DataProcessor, InputExample, InputFeatures
22 | from file_utils1 import is_tf_available
23 |
24 | if is_tf_available():
25 | import tensorflow as tf
26 |
27 | logger = logging.getLogger(__name__)
28 |
29 |
30 | def glue_convert_examples_to_features(examples, tokenizer,
31 | max_length=512,
32 | task=None,
33 | label_list=None,
34 | output_mode=None,
35 | pad_on_left=False,
36 | pad_token=0,
37 | pad_token_segment_id=0,
38 | mask_padding_with_zero=True):
39 | """
40 | Loads a data file into a list of ``InputFeatures``
41 |
42 | Args:
43 | examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
44 | tokenizer: Instance of a tokenizer that will tokenize the examples
45 | max_length: Maximum example length
46 | task: GLUE task
47 | label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
48 | output_mode: String indicating the output mode. Either ``regression`` or ``classification``
49 | pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
50 | pad_token: Padding token
51 | pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
52 | mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
53 | and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
54 | actual values)
55 |
56 | Returns:
57 | If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
58 | containing the task-specific features. If the input is a list of ``InputExamples``, will return
59 | a list of task-specific ``InputFeatures`` which can be fed to the model.
60 |
61 | """
62 | is_tf_dataset = False
63 | if is_tf_available() and isinstance(examples, tf.data.Dataset):
64 | is_tf_dataset = True
65 |
66 | if task is not None:
67 | processor = glue_processors[task]()
68 | if label_list is None:
69 | label_list = processor.get_labels()
70 | logger.info("Using label list %s for task %s" % (label_list, task))
71 | if output_mode is None:
72 | output_mode = glue_output_modes[task]
73 | logger.info("Using output mode %s for task %s" % (output_mode, task))
74 |
75 | label_map = {label: i for i, label in enumerate(label_list)}
76 |
77 | features = []
78 | for (ex_index, example) in enumerate(examples):
79 | if ex_index % 10000 == 0:
80 | logger.info("Writing example %d" % (ex_index))
81 | if is_tf_dataset:
82 | example = processor.get_example_from_tensor_dict(example)
83 |
84 | inputs = tokenizer.encode_plus(
85 | example.text_a,
86 | example.text_b,
87 | add_special_tokens=True,
88 | max_length=max_length,
89 | )
90 | input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
91 |
92 | # The mask has 1 for real tokens and 0 for padding tokens. Only real
93 | # tokens are attended to.
94 | attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
95 |
96 | # Zero-pad up to the sequence length.
97 | padding_length = max_length - len(input_ids)
98 | if pad_on_left:
99 | input_ids = ([pad_token] * padding_length) + input_ids
100 | attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
101 | token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
102 | else:
103 | input_ids = input_ids + ([pad_token] * padding_length)
104 | attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
105 | token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
106 |
107 | assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
108 | assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
109 | assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)
110 |
111 | if output_mode == "classification":
112 | label = label_map[example.label]
113 | elif output_mode == "regression":
114 | label = example.label
115 | elif output_mode == "MultiLabelclassification":
116 | label = example.label
117 | else:
118 | raise KeyError(output_mode)
119 |
120 | if ex_index < 5:
121 | logger.info("*** Example ***")
122 | logger.info("guid: %s" % (example.guid))
123 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
124 | logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
125 | logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
126 | # logger.info("label: %s (id = %d)" % (example.label, label))
127 |
128 | features.append(
129 | InputFeatures(input_ids=input_ids,
130 | attention_mask=attention_mask,
131 | token_type_ids=token_type_ids,
132 | label=label))
133 |
134 | if is_tf_available() and is_tf_dataset:
135 | def gen():
136 | for ex in features:
137 | yield ({'input_ids': ex.input_ids,
138 | 'attention_mask': ex.attention_mask,
139 | 'token_type_ids': ex.token_type_ids},
140 | ex.label)
141 |
142 | return tf.data.Dataset.from_generator(gen,
143 | ({'input_ids': tf.int32,
144 | 'attention_mask': tf.int32,
145 | 'token_type_ids': tf.int32},
146 | tf.int64),
147 | ({'input_ids': tf.TensorShape([None]),
148 | 'attention_mask': tf.TensorShape([None]),
149 | 'token_type_ids': tf.TensorShape([None])},
150 | tf.TensorShape([])))
151 |
152 | return features
153 |
154 |
155 | class MrpcProcessor(DataProcessor):
156 | """Processor for the MRPC data set (GLUE version)."""
157 |
158 | def get_example_from_tensor_dict(self, tensor_dict):
159 | """See base class."""
160 | return InputExample(tensor_dict['idx'].numpy(),
161 | tensor_dict['sentence1'].numpy().decode('utf-8'),
162 | tensor_dict['sentence2'].numpy().decode('utf-8'),
163 | str(tensor_dict['label'].numpy()))
164 |
165 | def get_train_examples(self, data_dir):
166 | """See base class."""
167 | logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv")))
168 | return self._create_examples(
169 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
170 |
171 | def get_dev_examples(self, data_dir):
172 | """See base class."""
173 | return self._create_examples(
174 | self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
175 |
176 | def get_labels(self):
177 | """See base class."""
178 | return ["0", "1"]
179 |
180 | def _create_examples(self, lines, set_type):
181 | """Creates examples for the training and dev sets."""
182 | examples = []
183 | for (i, line) in enumerate(lines):
184 | if i == 0:
185 | continue
186 | guid = "%s-%s" % (set_type, i)
187 | text_a = line[3]
188 | text_b = line[4]
189 | label = line[0]
190 | examples.append(
191 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
192 | return examples
193 |
194 | class StsbProcessor(DataProcessor):
195 | """Processor for the STS-B data set (GLUE version)."""
196 |
197 | def get_example_from_tensor_dict(self, tensor_dict):
198 | """See base class."""
199 | return InputExample(tensor_dict['idx'].numpy(),
200 | tensor_dict['sentence1'].numpy().decode('utf-8'),
201 | tensor_dict['sentence2'].numpy().decode('utf-8'),
202 | str(tensor_dict['label'].numpy()))
203 |
204 | def get_train_examples(self, data_dir):
205 | """See base class."""
206 | return self._create_examples(
207 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
208 |
209 | def get_dev_examples(self, data_dir):
210 | """See base class."""
211 | return self._create_examples(
212 | self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
213 |
214 | def get_labels(self):
215 | """See base class."""
216 | return [None]
217 |
218 | def _create_examples(self, lines, set_type):
219 | """Creates examples for the training and dev sets."""
220 | examples = []
221 | for (i, line) in enumerate(lines):
222 | if i == 0:
223 | continue
224 | guid = "%s-%s" % (set_type, line[0])
225 | text_a = line[7]
226 | text_b = line[8]
227 | label = line[-1]
228 | examples.append(
229 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
230 | return examples
231 |
232 | class MultiLabelProcessor(DataProcessor):
233 | """Processor for the MultiNLI data set (GLUE version)."""
234 |
235 | def get_example_from_tensor_dict(self, tensor_dict):
236 | """See base class."""
237 | return InputExample(tensor_dict['idx'].numpy(),
238 | tensor_dict['premise'].numpy().decode('utf-8'),
239 | tensor_dict['hypothesis'].numpy().decode('utf-8'),
240 | str(tensor_dict['label'].numpy()))
241 |
242 | def get_train_examples(self, data_dir):
243 | """See base class."""
244 | return self._create_examples(
245 | self._read_tsv(os.path.join(data_dir, "train_data.tsv")), "train")
246 |
247 | def get_dev_examples(self, data_dir):
248 | """See base class."""
249 | return self._create_examples(
250 | self._read_tsv(os.path.join(data_dir, "eval_data.tsv")),
251 | "dev_matched")
252 |
253 | def get_labels(self):
254 | """See base class."""
255 | return ["0", "1", "2","3","4","5","6", "7", "8","9","10","11","12", "13", "14","15"]
256 |
257 | def _create_examples(self, lines, set_type):
258 | """Creates examples for the training and dev sets."""
259 | examples = []
260 | for (i, line) in enumerate(lines):
261 | guid = "%s-%s" % (set_type, i)
262 | text_a = line[0]
263 | label = np.zeros((16,), dtype=int)  # 16-dim multi-hot vector, one slot per intent
264 | label_sum = ["inform_theater", "inform_starttime", "inform_numberofpeople", "greeting", "thanks", "inform_other", "request_moviename", "inform_genre", "request_ticket",
265 | "inform_city", "inform_state", "inform_date", "inform_moviename", "confirm_answer", "inform_zip", "inform_video_format"]
266 | for j in range(16):
267 | if label_sum[j] in line:  # slot j is 1 iff the j-th intent name appears in the row
268 | label[j] = 1
269 | examples.append(
270 | InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
271 |
272 | return examples
273 |
274 | glue_tasks_num_labels = {
275 | "mrpc": 2,
276 | "sts-b": 1,
277 | "multilabel":16,
278 | }
279 |
280 | glue_processors = {
281 | "mrpc": MrpcProcessor,
282 | "sts-b": StsbProcessor,
283 | "multilabel": MultiLabelProcessor,
284 | }
285 |
286 | glue_output_modes = {
287 | "mrpc": "classification",
288 | "sts-b": "regression",
289 | "multilabel": "MultiLabelclassification",
290 | }
291 |
--------------------------------------------------------------------------------
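
To make the multi-label path above concrete, the sketch below walks one utterance through the same steps MultiLabelProcessor and glue_convert_examples_to_features perform: intent names found in a TSV row become a 16-dimensional multi-hot vector, and the sentence is encoded and padded to a fixed length. Illustrative only; the example row and the `path/to/vocab.txt` path are placeholders, and the tokenizer is assumed to be the BertTokenizer from tokenization_bert1.py pointed at a locally downloaded bert-base-uncased vocab.txt.

# Illustrative sketch of the multilabel pipeline in glue1.py (not repository code).
import numpy as np
from utils import InputExample
from glue1 import glue_convert_examples_to_features, glue_processors, glue_output_modes
from tokenization_bert1 import BertTokenizer

processor = glue_processors["multilabel"]()
label_list = processor.get_labels()             # ["0", "1", ..., "15"]
output_mode = glue_output_modes["multilabel"]   # "MultiLabelclassification"

# A row shaped like the ones _create_examples reads: the sentence first, then its intent names.
line = ["two tickets for deadpool tomorrow night", "request_ticket", "inform_moviename", "inform_date"]
label_names = ["inform_theater", "inform_starttime", "inform_numberofpeople", "greeting",
               "thanks", "inform_other", "request_moviename", "inform_genre", "request_ticket",
               "inform_city", "inform_state", "inform_date", "inform_moviename",
               "confirm_answer", "inform_zip", "inform_video_format"]
label = np.zeros((16,), dtype=int)
for j, name in enumerate(label_names):
    if name in line:
        label[j] = 1    # multi-hot: ones at request_ticket, inform_date, inform_moviename

example = InputExample(guid="demo-0", text_a=line[0], text_b=None, label=label)
tokenizer = BertTokenizer("path/to/vocab.txt")  # placeholder: point at the downloaded vocab.txt
features = glue_convert_examples_to_features([example], tokenizer,
                                             max_length=128,
                                             label_list=label_list,
                                             output_mode=output_mode)
print(features[0].input_ids[:10], features[0].label)
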
/tokenization_bert1.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes."""
16 |
17 | from __future__ import absolute_import, division, print_function, unicode_literals
18 |
19 | import collections
20 | import logging
21 | import os
22 | import unicodedata
23 | from io import open
24 |
25 | from tokenization_utils1 import PreTrainedTokenizer
26 |
27 | logger = logging.getLogger(__name__)
28 |
29 | VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
30 |
31 | PRETRAINED_VOCAB_FILES_MAP = {
32 | 'vocab_file':
33 | {
34 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
35 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
36 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
37 | 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
38 | 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
39 | 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
40 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
41 | 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt",
42 | 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt",
43 | 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt",
44 | 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt",
45 | 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt",
46 | 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
47 | 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt",
48 | 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt",
49 | }
50 | }
51 |
52 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
53 | 'bert-base-uncased': 512,
54 | 'bert-large-uncased': 512,
55 | 'bert-base-cased': 512,
56 | 'bert-large-cased': 512,
57 | 'bert-base-multilingual-uncased': 512,
58 | 'bert-base-multilingual-cased': 512,
59 | 'bert-base-chinese': 512,
60 | 'bert-base-german-cased': 512,
61 | 'bert-large-uncased-whole-word-masking': 512,
62 | 'bert-large-cased-whole-word-masking': 512,
63 | 'bert-large-uncased-whole-word-masking-finetuned-squad': 512,
64 | 'bert-large-cased-whole-word-masking-finetuned-squad': 512,
65 | 'bert-base-cased-finetuned-mrpc': 512,
66 | 'bert-base-german-dbmdz-cased': 512,
67 | 'bert-base-german-dbmdz-uncased': 512,
68 | }
69 |
70 | PRETRAINED_INIT_CONFIGURATION = {
71 | 'bert-base-uncased': {'do_lower_case': True},
72 | 'bert-large-uncased': {'do_lower_case': True},
73 | 'bert-base-cased': {'do_lower_case': False},
74 | 'bert-large-cased': {'do_lower_case': False},
75 | 'bert-base-multilingual-uncased': {'do_lower_case': True},
76 | 'bert-base-multilingual-cased': {'do_lower_case': False},
77 | 'bert-base-chinese': {'do_lower_case': False},
78 | 'bert-base-german-cased': {'do_lower_case': False},
79 | 'bert-large-uncased-whole-word-masking': {'do_lower_case': True},
80 | 'bert-large-cased-whole-word-masking': {'do_lower_case': False},
81 | 'bert-large-uncased-whole-word-masking-finetuned-squad': {'do_lower_case': True},
82 | 'bert-large-cased-whole-word-masking-finetuned-squad': {'do_lower_case': False},
83 | 'bert-base-cased-finetuned-mrpc': {'do_lower_case': False},
84 | 'bert-base-german-dbmdz-cased': {'do_lower_case': False},
85 | 'bert-base-german-dbmdz-uncased': {'do_lower_case': True},
86 | }
87 |
88 |
89 | def load_vocab(vocab_file):
90 | """Loads a vocabulary file into a dictionary."""
91 | vocab = collections.OrderedDict()
92 | with open(vocab_file, "r", encoding="utf-8") as reader:
93 | tokens = reader.readlines()
94 | for index, token in enumerate(tokens):
95 | token = token.rstrip('\n')
96 | vocab[token] = index
97 | return vocab
98 |
99 |
100 | def whitespace_tokenize(text):
101 | """Runs basic whitespace cleaning and splitting on a piece of text."""
102 | text = text.strip()
103 | if not text:
104 | return []
105 | tokens = text.split()
106 | return tokens
107 |
108 |
109 | class BertTokenizer(PreTrainedTokenizer):
110 | r"""
111 | Constructs a BertTokenizer.
112 | :class:`~transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
113 |
114 | Args:
115 | vocab_file: Path to a one-wordpiece-per-line vocabulary file
116 | do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
117 | do_basic_tokenize: Whether to do basic tokenization before wordpiece.
118 | max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
119 | minimum of this value (if specified) and the underlying BERT model's sequence length.
120 | never_split: List of tokens which will never be split during tokenization. Only has an effect when
121 | do_basic_tokenize=True
122 | """
123 |
124 | vocab_files_names = VOCAB_FILES_NAMES
125 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
126 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
127 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
128 |
129 | def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None,
130 | unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]",
131 | mask_token="[MASK]", tokenize_chinese_chars=True, **kwargs):
132 | """Constructs a BertTokenizer.
133 |
134 | Args:
135 | **vocab_file**: Path to a one-wordpiece-per-line vocabulary file
136 | **do_lower_case**: (`optional`) boolean (default True)
137 | Whether to lower case the input
138 | Only has an effect when do_basic_tokenize=True
139 | **do_basic_tokenize**: (`optional`) boolean (default True)
140 | Whether to do basic tokenization before wordpiece.
141 | **never_split**: (`optional`) list of string
142 | List of tokens which will never be split during tokenization.
143 | Only has an effect when do_basic_tokenize=True
144 | **tokenize_chinese_chars**: (`optional`) boolean (default True)
145 | Whether to tokenize Chinese characters.
146 | This should likely be deactivated for Japanese:
147 | see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
148 | """
149 | super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
150 | pad_token=pad_token, cls_token=cls_token,
151 | mask_token=mask_token, **kwargs)
152 | self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
153 | self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
154 |
155 | if not os.path.isfile(vocab_file):
156 | raise ValueError(
157 | "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
158 | "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
159 | self.vocab = load_vocab(vocab_file)
160 | self.ids_to_tokens = collections.OrderedDict(
161 | [(ids, tok) for tok, ids in self.vocab.items()])
162 | self.do_basic_tokenize = do_basic_tokenize
163 | if do_basic_tokenize:
164 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
165 | never_split=never_split,
166 | tokenize_chinese_chars=tokenize_chinese_chars)
167 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
168 |
169 | @property
170 | def vocab_size(self):
171 | return len(self.vocab)
172 |
173 | def _tokenize(self, text):
174 | split_tokens = []
175 | if self.do_basic_tokenize:
176 | for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
177 | for sub_token in self.wordpiece_tokenizer.tokenize(token):
178 | split_tokens.append(sub_token)
179 | else:
180 | split_tokens = self.wordpiece_tokenizer.tokenize(text)
181 | return split_tokens
182 |
183 | def _convert_token_to_id(self, token):
184 | """ Converts a token (str/unicode) in an id using the vocab. """
185 | return self.vocab.get(token, self.vocab.get(self.unk_token))
186 |
187 | def _convert_id_to_token(self, index):
188 | """Converts an index (integer) in a token (string/unicode) using the vocab."""
189 | return self.ids_to_tokens.get(index, self.unk_token)
190 |
191 | def convert_tokens_to_string(self, tokens):
192 | """ Converts a sequence of tokens (string) in a single string. """
193 | out_string = ' '.join(tokens).replace(' ##', '').strip()
194 | return out_string
195 |
196 | def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
197 | """
198 | Build model inputs from a sequence or a pair of sequence for sequence classification tasks
199 | by concatenating and adding special tokens.
200 | A BERT sequence has the following format:
201 | single sequence: [CLS] X [SEP]
202 | pair of sequences: [CLS] A [SEP] B [SEP]
203 | """
204 | if token_ids_1 is None:
205 | return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
206 | cls = [self.cls_token_id]
207 | sep = [self.sep_token_id]
208 | return cls + token_ids_0 + sep + token_ids_1 + sep
209 |
210 | def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
211 | """
212 | Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
213 | special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
214 |
215 | Args:
216 | token_ids_0: list of ids (must not contain special tokens)
217 | token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
218 | for sequence pairs
219 | already_has_special_tokens: (default False) Set to True if the token list is already formatted with
220 | special tokens for the model
221 |
222 | Returns:
223 | A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
224 | """
225 |
226 | if already_has_special_tokens:
227 | if token_ids_1 is not None:
228 | raise ValueError("You should not supply a second sequence if the provided sequence of "
229 | "ids is already formated with special tokens for the model.")
230 | return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
231 |
232 | if token_ids_1 is not None:
233 | return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
234 | return [1] + ([0] * len(token_ids_0)) + [1]
235 |
236 | def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
237 | """
238 | Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
239 | A BERT sequence pair mask has the following format:
240 | 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
241 | | first sequence | second sequence
242 |
243 | if token_ids_1 is None, only returns the first portion of the mask (0's).
244 | """
245 | sep = [self.sep_token_id]
246 | cls = [self.cls_token_id]
247 | if token_ids_1 is None:
248 | return len(cls + token_ids_0 + sep) * [0]
249 | return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
250 |
251 | def save_vocabulary(self, vocab_path):
252 | """Save the tokenizer vocabulary to a directory or file."""
253 | index = 0
254 | if os.path.isdir(vocab_path):
255 | vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
256 | else:
257 | vocab_file = vocab_path
258 | with open(vocab_file, "w", encoding="utf-8") as writer:
259 | for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
260 | if index != token_index:
261 | logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive."
262 | " Please check that the vocabulary is not corrupted!".format(vocab_file))
263 | index = token_index
264 | writer.write(token + u'\n')
265 | index += 1
266 | return (vocab_file,)
267 |
268 |
269 | class BasicTokenizer(object):
270 | """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
271 |
272 | def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True):
273 | """ Constructs a BasicTokenizer.
274 |
275 | Args:
276 | **do_lower_case**: Whether to lower case the input.
277 | **never_split**: (`optional`) list of str
278 | Kept for backward compatibility purposes.
279 | Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
280 | List of tokens not to split.
281 | **tokenize_chinese_chars**: (`optional`) boolean (default True)
282 | Whether to tokenize Chinese characters.
283 | This should likely be deactivated for Japanese:
284 | see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
285 | """
286 | if never_split is None:
287 | never_split = []
288 | self.do_lower_case = do_lower_case
289 | self.never_split = never_split
290 | self.tokenize_chinese_chars = tokenize_chinese_chars
291 |
292 | def tokenize(self, text, never_split=None):
293 | """ Basic Tokenization of a piece of text.
294 | Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer.
295 |
296 | Args:
297 | **never_split**: (`optional`) list of str
298 | Kept for backward compatibility purposes.
299 | Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
300 | List of tokens not to split.
301 | """
302 | never_split = self.never_split + (never_split if never_split is not None else [])
303 | text = self._clean_text(text)
304 | # This was added on November 1st, 2018 for the multilingual and Chinese
305 | # models. This is also applied to the English models now, but it doesn't
306 | # matter since the English models were not trained on any Chinese data
307 | # and generally don't have any Chinese data in them (there are Chinese
308 | # characters in the vocabulary because Wikipedia does have some Chinese
309 | # words in the English Wikipedia.).
310 | if self.tokenize_chinese_chars:
311 | text = self._tokenize_chinese_chars(text)
312 | orig_tokens = whitespace_tokenize(text)
313 | split_tokens = []
314 | for token in orig_tokens:
315 | if self.do_lower_case and token not in never_split:
316 | token = token.lower()
317 | token = self._run_strip_accents(token)
318 | split_tokens.extend(self._run_split_on_punc(token))
319 |
320 | output_tokens = whitespace_tokenize(" ".join(split_tokens))
321 | return output_tokens
322 |
323 | def _run_strip_accents(self, text):
324 | """Strips accents from a piece of text."""
325 | text = unicodedata.normalize("NFD", text)
326 | output = []
327 | for char in text:
328 | cat = unicodedata.category(char)
329 | if cat == "Mn":
330 | continue
331 | output.append(char)
332 | return "".join(output)
333 |
334 | def _run_split_on_punc(self, text, never_split=None):
335 | """Splits punctuation on a piece of text."""
336 | if never_split is not None and text in never_split:
337 | return [text]
338 | chars = list(text)
339 | i = 0
340 | start_new_word = True
341 | output = []
342 | while i < len(chars):
343 | char = chars[i]
344 | if _is_punctuation(char):
345 | output.append([char])
346 | start_new_word = True
347 | else:
348 | if start_new_word:
349 | output.append([])
350 | start_new_word = False
351 | output[-1].append(char)
352 | i += 1
353 |
354 | return ["".join(x) for x in output]
355 |
356 | def _tokenize_chinese_chars(self, text):
357 | """Adds whitespace around any CJK character."""
358 | output = []
359 | for char in text:
360 | cp = ord(char)
361 | if self._is_chinese_char(cp):
362 | output.append(" ")
363 | output.append(char)
364 | output.append(" ")
365 | else:
366 | output.append(char)
367 | return "".join(output)
368 |
369 | def _is_chinese_char(self, cp):
370 | """Checks whether CP is the codepoint of a CJK character."""
371 | # This defines a "chinese character" as anything in the CJK Unicode block:
372 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
373 | #
374 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
375 | # despite its name. The modern Korean Hangul alphabet is a different block,
376 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write
377 | # space-separated words, so they are not treated specially and handled
378 | like all of the other languages.
379 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
380 | (cp >= 0x3400 and cp <= 0x4DBF) or #
381 | (cp >= 0x20000 and cp <= 0x2A6DF) or #
382 | (cp >= 0x2A700 and cp <= 0x2B73F) or #
383 | (cp >= 0x2B740 and cp <= 0x2B81F) or #
384 | (cp >= 0x2B820 and cp <= 0x2CEAF) or
385 | (cp >= 0xF900 and cp <= 0xFAFF) or #
386 | (cp >= 0x2F800 and cp <= 0x2FA1F)): #
387 | return True
388 |
389 | return False
390 |
391 | def _clean_text(self, text):
392 | """Performs invalid character removal and whitespace cleanup on text."""
393 | output = []
394 | for char in text:
395 | cp = ord(char)
396 | if cp == 0 or cp == 0xfffd or _is_control(char):
397 | continue
398 | if _is_whitespace(char):
399 | output.append(" ")
400 | else:
401 | output.append(char)
402 | return "".join(output)
403 |
404 |
405 | class WordpieceTokenizer(object):
406 | """Runs WordPiece tokenization."""
407 |
408 | def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
409 | self.vocab = vocab
410 | self.unk_token = unk_token
411 | self.max_input_chars_per_word = max_input_chars_per_word
412 |
413 | def tokenize(self, text):
414 | """Tokenizes a piece of text into its word pieces.
415 |
416 | This uses a greedy longest-match-first algorithm to perform tokenization
417 | using the given vocabulary.
418 |
419 | For example:
420 | input = "unaffable"
421 | output = ["un", "##aff", "##able"]
422 |
423 | Args:
424 | text: A single token or whitespace separated tokens. This should have
425 | already been passed through `BasicTokenizer`.
426 |
427 | Returns:
428 | A list of wordpiece tokens.
429 | """
430 |
431 | output_tokens = []
432 | for token in whitespace_tokenize(text):
433 | chars = list(token)
434 | if len(chars) > self.max_input_chars_per_word:
435 | output_tokens.append(self.unk_token)
436 | continue
437 |
438 | is_bad = False
439 | start = 0
440 | sub_tokens = []
441 | while start < len(chars):
442 | end = len(chars)
443 | cur_substr = None
444 | while start < end:
445 | substr = "".join(chars[start:end])
446 | if start > 0:
447 | substr = "##" + substr
448 | if substr in self.vocab:
449 | cur_substr = substr
450 | break
451 | end -= 1
452 | if cur_substr is None:
453 | is_bad = True
454 | break
455 | sub_tokens.append(cur_substr)
456 | start = end
457 |
458 | if is_bad:
459 | output_tokens.append(self.unk_token)
460 | else:
461 | output_tokens.extend(sub_tokens)
462 | return output_tokens
463 |
464 |
465 | def _is_whitespace(char):
466 | """Checks whether `chars` is a whitespace character."""
467 | # \t, \n, and \r are technically control characters but we treat them
468 | # as whitespace since they are generally considered as such.
469 | if char == " " or char == "\t" or char == "\n" or char == "\r":
470 | return True
471 | cat = unicodedata.category(char)
472 | if cat == "Zs":
473 | return True
474 | return False
475 |
476 |
477 | def _is_control(char):
478 | """Checks whether `chars` is a control character."""
479 | # These are technically control characters but we count them as whitespace
480 | # characters.
481 | if char == "\t" or char == "\n" or char == "\r":
482 | return False
483 | cat = unicodedata.category(char)
484 | if cat.startswith("C"):
485 | return True
486 | return False
487 |
488 |
489 | def _is_punctuation(char):
490 | """Checks whether `chars` is a punctuation character."""
491 | cp = ord(char)
492 | # We treat all non-letter/number ASCII as punctuation.
493 | # Characters such as "^", "$", and "`" are not in the Unicode
494 | # Punctuation class but we treat them as punctuation anyways, for
495 | # consistency.
496 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
497 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
498 | return True
499 | cat = unicodedata.category(char)
500 | if cat.startswith("P"):
501 | return True
502 | return False
503 |
--------------------------------------------------------------------------------
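
The WordPiece pass above is a greedy longest-match-first search over the vocabulary, and BasicTokenizer handles the whitespace cleanup, lower-casing, and punctuation splitting that precede it; build_inputs_with_special_tokens then wraps the resulting ids as [CLS] X [SEP]. The toy vocabulary below is invented purely to show the splitting behaviour; a real run loads the full bert-base-uncased vocab.txt instead.

# Illustrative sketch of the tokenizers in tokenization_bert1.py (not repository code).
import collections
from tokenization_bert1 import BasicTokenizer, WordpieceTokenizer

toy_vocab = collections.OrderedDict(
    (tok, idx) for idx, tok in enumerate(
        ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "un", "##aff", "##able", "runs", "."]))

wordpiece = WordpieceTokenizer(vocab=toy_vocab, unk_token="[UNK]")
print(wordpiece.tokenize("unaffable"))     # ['un', '##aff', '##able']  (longest match first)
print(wordpiece.tokenize("unsplittable"))  # ['[UNK]']  (cannot be fully segmented with this vocab)

basic = BasicTokenizer(do_lower_case=True)
print(basic.tokenize("UnAffable runs."))   # ['unaffable', 'runs', '.']  (punctuation split off)
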
/modeling_roberta1.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """PyTorch RoBERTa model. """
17 |
18 | from __future__ import (absolute_import, division, print_function,
19 | unicode_literals)
20 |
21 | import logging
22 |
23 | import torch
24 | import torch.nn as nn
25 | from torch.nn import CrossEntropyLoss, MSELoss
26 |
27 | from modeling_bert1 import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu
28 | from configuration_roberta1 import RobertaConfig
29 | from file_utils1 import add_start_docstrings
30 |
31 | logger = logging.getLogger(__name__)
32 |
33 | ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
34 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin",
35 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin",
36 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin",
37 | }
38 |
39 |
40 | class RobertaEmbeddings(BertEmbeddings):
41 | """
42 | Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
43 | """
44 |
45 | def __init__(self, config):
46 | super(RobertaEmbeddings, self).__init__(config)
47 | self.padding_idx = 1
48 | self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
49 | self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size,
50 | padding_idx=self.padding_idx)
51 |
52 | def forward(self, input_ids, token_type_ids=None, position_ids=None):
53 | seq_length = input_ids.size(1)
54 | if position_ids is None:
55 | # Position numbers begin at padding_idx+1. Padding symbols are ignored.
56 | # cf. fairseq's `utils.make_positions`
57 | position_ids = torch.arange(self.padding_idx + 1, seq_length + self.padding_idx + 1, dtype=torch.long,
58 | device=input_ids.device)
59 | position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
60 | return super(RobertaEmbeddings, self).forward(input_ids,
61 | token_type_ids=token_type_ids,
62 | position_ids=position_ids)
63 |
64 |
65 | ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in
66 | `RoBERTa: A Robustly Optimized BERT Pretraining Approach`_
67 | by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer,
68 | Veselin Stoyanov. It is based on Google's BERT model released in 2018.
69 |
70 | It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining
71 | objective and training with much larger mini-batches and learning rates.
72 |
73 | This implementation is the same as BertModel with a tiny embeddings tweak as well as a setup for Roberta pretrained
74 | models.
75 |
76 | This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
77 | refer to the PyTorch documentation for all matter related to general usage and behavior.
78 |
79 | .. _`RoBERTa: A Robustly Optimized BERT Pretraining Approach`:
80 | https://arxiv.org/abs/1907.11692
81 |
82 | .. _`torch.nn.Module`:
83 | https://pytorch.org/docs/stable/nn.html#module
84 |
85 | Parameters:
86 | config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the
87 | model. Initializing with a config file does not load the weights associated with the model, only the configuration.
88 | Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
89 | """
90 |
91 | ROBERTA_INPUTS_DOCSTRING = r"""
92 | Inputs:
93 | **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
94 | Indices of input sequence tokens in the vocabulary.
95 | To match pre-training, RoBERTa input sequence should be formatted with <s> and </s> tokens as follows:
96 |
97 | (a) For sequence pairs:
98 |
99 | ``tokens: <s> Is this Jacksonville ? </s> </s> No it is not . </s>``
100 |
101 | (b) For single sequences:
102 |
103 | ``tokens: <s> the dog is hairy . </s>``
104 |
105 | Fully encoded sequences or sequence pairs can be obtained using the RobertaTokenizer.encode function with
106 | the ``add_special_tokens`` parameter set to ``True``.
107 |
108 | RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on
109 | the right rather than the left.
110 |
111 | See :func:`transformers.PreTrainedTokenizer.encode` and
112 | :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
113 | **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
114 | Mask to avoid performing attention on padding token indices.
115 | Mask values selected in ``[0, 1]``:
116 | ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
117 | **token_type_ids**: (`optional`, needs to be trained) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
118 | Optional segment token indices to indicate first and second portions of the inputs.
119 | This embedding matrix is not pretrained (it is not used during RoBERTa pretraining), so you will have to train it
120 | during finetuning.
121 | Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
122 | corresponds to a `sentence B` token
123 | (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
124 | **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
125 | Indices of positions of each input sequence token in the position embeddings.
126 | Selected in the range ``[0, config.max_position_embeddings - 1]``.
127 | **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
128 | Mask to nullify selected heads of the self-attention modules.
129 | Mask values selected in ``[0, 1]``:
130 | ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
131 | """
132 |
133 |
134 | @add_start_docstrings(
135 | "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
136 | ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
137 | class RobertaModel(BertModel):
138 | r"""
139 | Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
140 | **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
141 | Sequence of hidden-states at the output of the last layer of the model.
142 | **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
143 | Last layer hidden-state of the first token of the sequence (classification token)
144 | further processed by a Linear layer and a Tanh activation function. The Linear
145 | layer weights are trained from the next sentence prediction (classification)
146 | objective during Bert pretraining. This output is usually *not* a good summary
147 | of the semantic content of the input; you're often better off averaging or pooling
148 | the sequence of hidden-states for the whole input sequence.
149 | **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
150 | list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
151 | of shape ``(batch_size, sequence_length, hidden_size)``:
152 | Hidden-states of the model at the output of each layer plus the initial embedding outputs.
153 | **attentions**: (`optional`, returned when ``config.output_attentions=True``)
154 | list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
155 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
156 |
157 | Examples::
158 |
159 | tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
160 | model = RobertaModel.from_pretrained('roberta-base')
161 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
162 | outputs = model(input_ids)
163 | last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
164 |
165 | """
166 | config_class = RobertaConfig
167 | pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
168 | base_model_prefix = "roberta"
169 |
170 | def __init__(self, config):
171 | super(RobertaModel, self).__init__(config)
172 |
173 | self.embeddings = RobertaEmbeddings(config)
174 | self.init_weights()
175 |
176 | def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
177 | if input_ids[:, 0].sum().item() != 0:
178 | logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. "
179 | "This model requires special tokens in order to work. "
180 | "Please specify add_special_tokens=True in your tokenize.encode()"
181 | "or tokenizer.convert_tokens_to_ids().")
182 | return super(RobertaModel, self).forward(input_ids,
183 | attention_mask=attention_mask,
184 | token_type_ids=token_type_ids,
185 | position_ids=position_ids,
186 | head_mask=head_mask)
187 |
188 |
189 | @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """,
190 | ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
191 | class RobertaForMaskedLM(BertPreTrainedModel):
192 | r"""
193 | **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
194 | Labels for computing the masked language modeling loss.
195 | Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
196 | Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
197 | in ``[0, ..., config.vocab_size]``
198 |
199 | Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
200 | **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
201 | Masked language modeling loss.
202 | **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
203 | Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
204 | **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
205 | list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
206 | of shape ``(batch_size, sequence_length, hidden_size)``:
207 | Hidden-states of the model at the output of each layer plus the initial embedding outputs.
208 | **attentions**: (`optional`, returned when ``config.output_attentions=True``)
209 | list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
210 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
211 |
212 | Examples::
213 |
214 | tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
215 | model = RobertaForMaskedLM.from_pretrained('roberta-base')
216 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
217 | outputs = model(input_ids, masked_lm_labels=input_ids)
218 | loss, prediction_scores = outputs[:2]
219 |
220 | """
221 | config_class = RobertaConfig
222 | pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
223 | base_model_prefix = "roberta"
224 |
225 | def __init__(self, config):
226 | super(RobertaForMaskedLM, self).__init__(config)
227 |
228 | self.roberta = RobertaModel(config)
229 | self.lm_head = RobertaLMHead(config)
230 |
231 | self.init_weights()
232 | self.tie_weights()
233 |
234 | def tie_weights(self):
235 | """ Make sure we are sharing the input and output embeddings.
236 | Export to TorchScript can't handle parameter sharing so we are cloning them instead.
237 | """
238 | self._tie_or_clone_weights(self.lm_head.decoder, self.roberta.embeddings.word_embeddings)
239 |
240 | def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
241 | masked_lm_labels=None):
242 | outputs = self.roberta(input_ids,
243 | attention_mask=attention_mask,
244 | token_type_ids=token_type_ids,
245 | position_ids=position_ids,
246 | head_mask=head_mask)
247 | sequence_output = outputs[0]
248 | prediction_scores = self.lm_head(sequence_output)
249 |
250 | outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
251 |
252 | if masked_lm_labels is not None:
253 | loss_fct = CrossEntropyLoss(ignore_index=-1)
254 | masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
255 | outputs = (masked_lm_loss,) + outputs
256 |
257 | return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
258 |
259 |
260 | class RobertaLMHead(nn.Module):
261 | """Roberta Head for masked language modeling."""
262 |
263 | def __init__(self, config):
264 | super(RobertaLMHead, self).__init__()
265 | self.dense = nn.Linear(config.hidden_size, config.hidden_size)
266 | self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
267 |
268 | self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
269 | self.bias = nn.Parameter(torch.zeros(config.vocab_size))
270 |
271 | def forward(self, features, **kwargs):
272 | x = self.dense(features)
273 | x = gelu(x)
274 | x = self.layer_norm(x)
275 |
276 | # project back to size of vocabulary with bias
277 | x = self.decoder(x) + self.bias
278 |
279 | return x
280 |
281 |
282 | @add_start_docstrings("""RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer
283 | on top of the pooled output) e.g. for GLUE tasks. """,
284 | ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
285 | class RobertaForSequenceClassification(BertPreTrainedModel):
286 | r"""
287 | **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
288 | Labels for computing the sequence classification/regression loss.
289 | Indices should be in ``[0, ..., config.num_labels - 1]``.
290 | If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
291 | If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
292 |
293 | Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
294 | **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
295 | Classification (or regression if config.num_labels==1) loss.
296 | **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
297 | Classification (or regression if config.num_labels==1) scores (before SoftMax).
298 | **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
299 | list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
300 | of shape ``(batch_size, sequence_length, hidden_size)``:
301 | Hidden-states of the model at the output of each layer plus the initial embedding outputs.
302 | **attentions**: (`optional`, returned when ``config.output_attentions=True``)
303 | list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
304 |             Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
305 |
306 | Examples::
307 |
308 | tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
309 | model = RobertaForSequenceClassification.from_pretrained('roberta-base')
310 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
311 | labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
312 | outputs = model(input_ids, labels=labels)
313 | loss, logits = outputs[:2]
314 |
315 | """
316 | config_class = RobertaConfig
317 | pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
318 | base_model_prefix = "roberta"
319 |
320 | def __init__(self, config):
321 | super(RobertaForSequenceClassification, self).__init__(config)
322 | self.num_labels = config.num_labels
323 |
324 | self.roberta = RobertaModel(config)
325 | self.classifier = RobertaClassificationHead(config)
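    |         # Unlike RobertaForMaskedLM above, this class does not call self.init_weights(), so the
    |         # classification head defined here keeps PyTorch's default Linear initialization.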
326 |
327 | def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
328 | labels=None):
329 | outputs = self.roberta(input_ids,
330 | attention_mask=attention_mask,
331 | token_type_ids=token_type_ids,
332 | position_ids=position_ids,
333 | head_mask=head_mask)
334 | sequence_output = outputs[0]
335 | logits = self.classifier(sequence_output)
336 |
337 | outputs = (logits,) + outputs[2:]
338 | if labels is not None:
339 | if self.num_labels == 1:
340 | # We are doing regression
341 | loss_fct = MSELoss()
342 | loss = loss_fct(logits.view(-1), labels.view(-1))
343 | else:
344 | loss_fct = CrossEntropyLoss()
345 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
346 | outputs = (loss,) + outputs
347 |
348 | return outputs # (loss), logits, (hidden_states), (attentions)
349 |
350 |
351 | @add_start_docstrings("""Roberta Model with a multiple choice classification head on top (a linear layer on top of
352 | the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
353 | ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
354 | class RobertaForMultipleChoice(BertPreTrainedModel):
355 | r"""
356 | Inputs:
357 | **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
358 | Indices of input sequence tokens in the vocabulary.
359 | The second dimension of the input (`num_choices`) indicates the number of choices to score.
360 |             To match pre-training, the RoBERTa input sequence should be formatted with [CLS] and [SEP] tokens as follows:
361 |
362 | (a) For sequence pairs:
363 |
364 | ``tokens: [CLS] is this jack ##son ##ville ? [SEP] [SEP] no it is not . [SEP]``
365 |
366 | ``token_type_ids: 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
367 |
368 | (b) For single sequences:
369 |
370 | ``tokens: [CLS] the dog is hairy . [SEP]``
371 |
372 | ``token_type_ids: 0 0 0 0 0 0 0``
373 |
374 | Indices can be obtained using :class:`transformers.BertTokenizer`.
375 | See :func:`transformers.PreTrainedTokenizer.encode` and
376 | :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
377 | **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
378 | Segment token indices to indicate first and second portions of the inputs.
379 | The second dimension of the input (`num_choices`) indicates the number of choices to score.
380 |             Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token.
381 | **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
382 | Mask to avoid performing attention on padding token indices.
383 | The second dimension of the input (`num_choices`) indicates the number of choices to score.
384 | Mask values selected in ``[0, 1]``:
385 | ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
386 | **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
387 | Mask to nullify selected heads of the self-attention modules.
388 | Mask values selected in ``[0, 1]``:
389 | ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
390 | **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
391 | Labels for computing the multiple choice classification loss.
392 |             Indices should be in ``[0, ..., num_choices - 1]`` where `num_choices` is the size of the second dimension
393 | of the input tensors. (see `input_ids` above)
394 |
395 | Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
396 | **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
397 | Classification loss.
398 | **classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension
399 | of the input tensors. (see `input_ids` above).
400 | Classification scores (before SoftMax).
401 | **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
402 | list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
403 | of shape ``(batch_size, sequence_length, hidden_size)``:
404 | Hidden-states of the model at the output of each layer plus the initial embedding outputs.
405 | **attentions**: (`optional`, returned when ``config.output_attentions=True``)
406 | list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
407 |             Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
408 |
409 | Examples::
410 |
411 | tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
412 | model = RobertaForMultipleChoice.from_pretrained('roberta-base')
413 | choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
414 | input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
415 | labels = torch.tensor(1).unsqueeze(0) # Batch size 1
416 | outputs = model(input_ids, labels=labels)
417 | loss, classification_scores = outputs[:2]
418 |
419 | """
420 | config_class = RobertaConfig
421 | pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
422 | base_model_prefix = "roberta"
423 |
424 | def __init__(self, config):
425 | super(RobertaForMultipleChoice, self).__init__(config)
426 |
427 | self.roberta = RobertaModel(config)
428 | self.dropout = nn.Dropout(config.hidden_dropout_prob)
429 | self.classifier = nn.Linear(config.hidden_size, 1)
430 |
431 | self.init_weights()
432 |
433 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
434 | position_ids=None, head_mask=None):
435 | num_choices = input_ids.shape[1]
436 |
437 | flat_input_ids = input_ids.view(-1, input_ids.size(-1))
438 | flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
439 | flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
440 | flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
441 | outputs = self.roberta(flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids,
442 | attention_mask=flat_attention_mask, head_mask=head_mask)
443 | pooled_output = outputs[1]
444 |
445 | pooled_output = self.dropout(pooled_output)
446 | logits = self.classifier(pooled_output)
447 | reshaped_logits = logits.view(-1, num_choices)
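    |         # reshaped_logits has shape (batch_size, num_choices): one score per choice, so the
    |         # CrossEntropyLoss below can treat the choices of each example as competing classes.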
448 |
449 | outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here
450 |
451 | if labels is not None:
452 | loss_fct = CrossEntropyLoss()
453 | loss = loss_fct(reshaped_logits, labels)
454 | outputs = (loss,) + outputs
455 |
456 | return outputs # (loss), reshaped_logits, (hidden_states), (attentions)
457 |
458 |
459 | class RobertaClassificationHead(nn.Module):
460 | """Head for sentence-level classification tasks."""
461 |
462 | def __init__(self, config):
463 | super(RobertaClassificationHead, self).__init__()
464 | self.dense = nn.Linear(config.hidden_size, config.hidden_size)
465 | self.dropout = nn.Dropout(config.hidden_dropout_prob)
466 | self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
467 |
468 | def forward(self, features, **kwargs):
469 |         x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
470 | x = self.dropout(x)
471 | x = self.dense(x)
472 | x = torch.tanh(x)
473 | x = self.dropout(x)
474 | x = self.out_proj(x)
475 | return x
476 |
--------------------------------------------------------------------------------
/run_glue.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa)."""
17 |
18 | from __future__ import absolute_import, division, print_function
19 | from sklearn.metrics import roc_curve, auc
20 | import math
21 | import argparse
22 | import glob
23 | import logging
24 | import os
25 | import random
26 |
27 | import numpy as np
28 | import torch
29 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
30 | TensorDataset)
31 | from torch.utils.data.distributed import DistributedSampler
32 |
33 | try:
34 | from torch.utils.tensorboard import SummaryWriter
35 | except ImportError:
36 | from tensorboardX import SummaryWriter
37 |
38 | from tqdm import tqdm, trange
39 | from file_utils1 import WEIGHTS_NAME
40 | from configuration_bert1 import BertConfig
41 | from modeling_bert1 import BertForMultiSequenceClassification
42 | from tokenization_bert1 import BertTokenizer
43 | from configuration_roberta1 import RobertaConfig
44 | from modeling_roberta1 import RobertaForSequenceClassification
45 | from tokenization_roberta1 import RobertaTokenizer
46 | # from transformers import (WEIGHTS_NAME, BertConfig,
47 | # BertForSequenceClassification, BertTokenizer,
48 | # RobertaConfig,
49 | # RobertaForSequenceClassification,
50 | # RobertaTokenizer,
51 | # XLMConfig, XLMForSequenceClassification,
52 | # XLMTokenizer, XLNetConfig,
53 | # XLNetForSequenceClassification,
54 | # XLNetTokenizer,
55 | # DistilBertConfig,
56 | # DistilBertForSequenceClassification,
57 | # DistilBertTokenizer)
58 | from optimization1 import AdamW,WarmupLinearSchedule
59 | # from transformers import AdamW, WarmupLinearSchedule
60 | from metrics1 import glue_compute_metrics as compute_metrics
61 | # from transformers import glue_compute_metrics as compute_metrics
62 | # from transformers import glue_output_modes as output_modes
63 | from glue1 import glue_output_modes as output_modes
64 | # from transformers import glue_processors as processors
65 | from glue1 import glue_processors as processors
66 | from glue1 import glue_convert_examples_to_features as convert_examples_to_features
67 | # from transformers import glue_convert_examples_to_features as convert_examples_to_features
68 | def sigmoid(x):
69 | return 1. / (1 + np.exp(-x))
70 | logger = logging.getLogger(__name__)
71 |
72 | ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig,
73 | RobertaConfig)), ())
74 |
75 | MODEL_CLASSES = {
76 | 'bert': (BertConfig, BertForMultiSequenceClassification, BertTokenizer),
77 | # 'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
78 | # 'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
79 | 'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
80 | # 'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
81 | }
82 |
83 |
84 | def set_seed(args):
85 | random.seed(args.seed)
86 | np.random.seed(args.seed)
87 | torch.manual_seed(args.seed)
88 | if args.n_gpu > 0:
89 | torch.cuda.manual_seed_all(args.seed)
90 |
91 | # def softmax(inMatrix):
92 | # """
93 | # softmax计算公式函数
94 | # :param inMatrix: 矩阵数据
95 | # :return:
96 | # """
97 | # m,n = np.shape(inMatrix) #得到m,n(行,列)
98 | # outMatrix = np.mat(np.zeros((m,n)))#mat生成数组
99 | # for i in range(m):
100 | # soft_sum = 0
101 | # for idx in range(0,n):
102 | # outMatrix[i,idx] = math.exp(inMatrix[i,idx]) #求幂运算,取e为底的指数计算变成非负
103 | # soft_sum +=outMatrix[i,idx] #求和运算
104 | # for idx in range(0,n):
105 | # outMatrix[i,idx] = outMatrix[i,idx] /soft_sum #然后除以所有项之后进行归一化
106 | # return outMatrix
107 |
108 | def train(args, train_dataset, model, tokenizer):
109 | """ Train the model """
110 | if args.local_rank in [-1, 0]:
111 | tb_writer = SummaryWriter()
112 |
114 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
115 | train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
116 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
117 |
118 | if args.max_steps > 0:
119 | t_total = args.max_steps
120 | args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
121 | else:
122 | t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
123 |
124 | # Prepare optimizer and schedule (linear warmup and decay)
125 | no_decay = ['bias', 'LayerNorm.weight']
126 | optimizer_grouped_parameters = [
127 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
128 | 'weight_decay': args.weight_decay},
129 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
130 | ]
131 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
132 | scheduler = WarmupLinearSchedule(optimizer, warmup_steps=t_total * 0.06, t_total=t_total)
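    |     # warmup_steps is set to t_total * 0.06, i.e. roughly 6% of all optimization steps are used
    |     # for linear warmup; note that the --warmup_steps command-line argument is not used here.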
133 | if args.fp16:
134 | try:
135 | from apex import amp
136 | except ImportError:
137 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
138 | model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
139 |
140 | # multi-gpu training (should be after apex fp16 initialization)
141 | if args.n_gpu > 1:
142 | model = torch.nn.DataParallel(model)
143 |
144 | # Distributed training (should be after apex fp16 initialization)
145 | if args.local_rank != -1:
146 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
147 | output_device=args.local_rank,
148 | find_unused_parameters=True)
149 |
150 | # Train!
151 | logger.info("***** Running training *****")
152 | logger.info(" Num examples = %d", len(train_dataset))
153 | logger.info(" Num Epochs = %d", args.num_train_epochs)
154 | logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
155 | logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
156 | args.train_batch_size * args.gradient_accumulation_steps * (
157 | torch.distributed.get_world_size() if args.local_rank != -1 else 1))
158 | logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
159 | logger.info(" Total optimization steps = %d", t_total)
160 |
161 | global_step = 0
162 | tr_loss, logging_loss = 0.0, 0.0
163 | model.zero_grad()
164 | train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
165 |     set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
166 |
167 | for _ in train_iterator:
168 | epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
169 | for step, batch in enumerate(epoch_iterator):
170 | model.train()
171 | batch = tuple(t.to(args.device) for t in batch)
172 | inputs = {'input_ids': batch[0],
173 | 'attention_mask': batch[1],
174 | 'labels': batch[3]}
175 | if args.model_type != 'distilbert':
176 | inputs['token_type_ids'] = batch[2] if args.model_type in ['bert',
177 | 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids
178 | outputs = model(**inputs)
179 | loss = outputs[0] # model outputs are always tuple in transformers (see doc)
180 |
181 | if args.n_gpu > 1:
182 | loss = loss.mean() # mean() to average on multi-gpu parallel training
183 | if args.gradient_accumulation_steps > 1:
184 | loss = loss / args.gradient_accumulation_steps
185 |
186 | if args.fp16:
187 | with amp.scale_loss(loss, optimizer) as scaled_loss:
188 | scaled_loss.backward()
189 | else:
190 | loss.backward()
191 | tr_loss += loss.item()
192 | if (step + 1) % args.gradient_accumulation_steps == 0 and not args.tpu:
193 | if args.fp16:
194 | torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
195 | else:
196 | torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
197 |
198 | optimizer.step()
199 | scheduler.step() # Update learning rate schedule
200 | model.zero_grad()
201 | global_step += 1
202 |
203 | if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
204 | # Log metrics
205 | if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
206 | results = evaluate(args, model, tokenizer)
207 | for key, value in results.items():
208 | tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
209 | tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
210 | tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
211 | logging_loss = tr_loss
212 |
213 | if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
214 | # Save model checkpoint
215 | output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
216 | if not os.path.exists(output_dir):
217 | os.makedirs(output_dir)
218 | model_to_save = model.module if hasattr(model,
219 | 'module') else model # Take care of distributed/parallel training
220 | model_to_save.save_pretrained(output_dir)
221 | torch.save(args, os.path.join(output_dir, 'training_args.bin'))
222 | logger.info("Saving model checkpoint to %s", output_dir)
223 |
224 | if args.tpu:
225 | args.xla_model.optimizer_step(optimizer, barrier=True)
226 | model.zero_grad()
227 | global_step += 1
228 |
229 | if args.max_steps > 0 and global_step > args.max_steps:
230 | epoch_iterator.close()
231 | break
232 | if args.max_steps > 0 and global_step > args.max_steps:
233 | train_iterator.close()
234 | break
235 | eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
236 | eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (
237 | args.output_dir,)
238 |
239 | if args.local_rank in [-1, 0]:
240 | tb_writer.close()
241 |
242 | return global_step, tr_loss / global_step
243 |
244 |
245 | def evaluate(args, model, tokenizer, prefix=""):
246 | # Loop to handle MNLI double evaluation (matched, mis-matched)
247 | eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
248 | eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
249 |
250 | results = {}
251 | for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
252 | eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
253 |
254 | if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
255 | os.makedirs(eval_output_dir)
256 |
257 | args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
258 | # Note that DistributedSampler samples randomly
259 | eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
260 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
261 |
262 | # Eval!
263 | logger.info("***** Running evaluation {} *****".format(prefix))
264 | logger.info(" Num examples = %d", len(eval_dataset))
265 | logger.info(" Batch size = %d", args.eval_batch_size)
266 | eval_loss = 0.0
267 | nb_eval_steps = 0
268 | preds = None
269 | out_label_ids = None
270 |         all_logits=[]
271 | for batch in tqdm(eval_dataloader, desc="Evaluating"):
272 | model.eval()
273 | batch = tuple(t.to(args.device) for t in batch)
274 |
275 | with torch.no_grad():
276 | inputs = {'input_ids': batch[0],
277 | 'attention_mask': batch[1],
278 | 'labels': batch[3]}
279 | if args.model_type != 'distilbert':
280 | inputs['token_type_ids'] = batch[2] if args.model_type in ['bert',
281 | 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids
282 | outputs = model(**inputs)
283 | tmp_eval_loss, logits = outputs[:2]
284 |             tem_logits = logits
285 |             Array_logits = np.array(tem_logits.cpu())
286 |             # collect the logits of every example in the batch; .tolist() on the 2-D array yields
287 |             # one list per example, so this works for any eval batch size
288 |             all_logits.extend(Array_logits.tolist())
289 | eval_loss += tmp_eval_loss.mean().item()
290 | nb_eval_steps += 1
291 | if preds is None:
292 | preds = logits.detach().cpu().numpy()
293 | out_label_ids = inputs['labels'].detach().cpu().numpy()
294 | else:
295 | preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
296 | out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
297 |         # for x in range(10,98):
298 |         Q=0.9
299 |         tem_logits_NEW=[]
300 |         tem_list=[]
301 |         all_logits=np.array(all_logits)
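    |         # Q is the sigmoid threshold above which a label counts as predicted; 399 appears to be
    |         # the number of dev examples and 16 is the number of labels in data/labels.txt. These
    |         # constants are hard-coded below and would need to change for a different dataset.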
302 |         for i in range(399):
303 |             for j in range(16):
304 |                 tem_list.append(sigmoid(all_logits[i][j]))
305 |             tem_logits_NEW.append(tem_list)
306 |             tem_list=[]
307 |         # tem_logits_NEW=np.array(tem_logits_NEW)
308 |         # tem_logits_NEW=softmax(tem_logits_NEW)
309 |         # tem_logits_NEW=tem_logits_NEW.getA().tolist()
310 |
311 |         for i in range(399):
312 |             for j in range(16):
313 |                 if tem_logits_NEW[i][j] > Q:
314 |                     tem_logits_NEW[i][j]=1
315 |                 else: tem_logits_NEW[i][j]=0
316 |         tem_logits_NEW=np.array(tem_logits_NEW)
317 |         count=0
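    |         # The loop below computes an example-based (sample-averaged) F1: for each dev example,
    |         # w = number of predicted labels, k = number of gold labels, z = number of labels in both,
    |         # so N = z/w is precision, M = z/k is recall and H = 2*N*M/(N+M) is that example's F1.
    |         # Tiny worked example: predicting {A, B} against gold {B, C} gives w=2, k=2, z=1,
    |         # hence N = M = 0.5 and H = 0.5. F1 is then the mean of H over the 399 dev examples.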
318 |         for i in range(399):
319 |             tem_1=tem_logits_NEW[i]
320 |             tem_2=out_label_ids[i]
321 |             w=0
322 |             k=0
323 |             z=0
324 |             for j in range(16):
325 |                 if tem_1[j]==1:
326 |                     w=w+1
327 |             for j in range(16):
328 |                 if tem_2[j]==1:
329 |                     k=k+1
330 |             for j in range(16):
331 |                 if tem_1[j]==1 and tem_2[j]==1:
332 |                     z=z+1
333 |             N = z*1.0/(w+0.00000001)
334 |             M = z*1.0/(k+0.00000001)
335 |             if N+M == 0:
336 |                 H=0
337 |             else:
338 |                 H = (2*N*M)/(N+M)
339 |             count=count+H
340 |         F1=count/399
341 |         print("***********************************************")
342 |         print(F1)
343 |         print("***********************************************")
344 |
345 |         # all_logits = np.array(all_logits)
346 |         # eval_loss = eval_loss / nb_eval_steps
347 |         # fpr = dict()
348 |         # tpr = dict()
349 |         # roc_auc = dict()
350 |         # for i in range(16):
351 |         #     fpr[i], tpr[i], _ = roc_curve(out_label_ids[:, i], all_logits[:, i])
352 |         #     roc_auc[i] = auc(fpr[i], tpr[i])
353 |         # fpr["micro"], tpr["micro"], _ = roc_curve(out_label_ids.ravel(), all_logits.ravel())
354 |         # roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
355 |         # print(roc_auc)
356 |         H = {'F1': F1}
357 |         return H
358 | # if args.output_mode == "MultiLabelclassification":
359 | # output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
360 | # with open(output_eval_file, "w") as writer:
361 | # logger.info("***** Eval results {} *****".format(prefix))
362 | # # for key in sorted(result.keys()):
363 | # logger.info(" %s ", str(eval_loss))
364 | # writer.write("%s\n" % (str(eval_loss)))
365 | # return eval_loss
366 | # if args.output_mode == "classification":
367 | # preds = np.argmax(preds, axis=1)
368 | # elif args.output_mode == "regression":
369 | # preds = np.squeeze(preds)
370 | # result = compute_metrics(eval_task, preds, out_label_ids)
371 | # results.update(result)
372 | #
373 | # output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
374 | # with open(output_eval_file, "w") as writer:
375 | # logger.info("***** Eval results {} *****".format(prefix))
376 | # for key in sorted(result.keys()):
377 | # logger.info(" %s = %s", key, str(result[key]))
378 | # writer.write("%s = %s\n" % (key, str(result[key])))
379 | #
380 | # return results
381 |
382 |
383 | def load_and_cache_examples(args, task, tokenizer, evaluate=False):
384 | if args.local_rank not in [-1, 0] and not evaluate:
385 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
386 |
387 | processor = processors[task]()
388 | output_mode = output_modes[task]
389 | # Load data features from cache or dataset file
390 | cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
391 | 'dev' if evaluate else 'train',
392 | list(filter(None, args.model_name_or_path.split('/'))).pop(),
393 | str(args.max_seq_length),
394 | str(task)))
395 | if os.path.exists(cached_features_file) and not args.overwrite_cache:
396 | logger.info("Loading features from cached file %s", cached_features_file)
397 | features = torch.load(cached_features_file)
398 | else:
399 | logger.info("Creating features from dataset file at %s", args.data_dir)
400 | label_list = processor.get_labels()
401 | if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
402 | # HACK(label indices are swapped in RoBERTa pretrained model)
403 | label_list[1], label_list[2] = label_list[2], label_list[1]
404 | examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(
405 | args.data_dir)
406 | features = convert_examples_to_features(examples,
407 | tokenizer,
408 | label_list=label_list,
409 | max_length=args.max_seq_length,
410 | output_mode=output_mode,
411 | pad_on_left=bool(args.model_type in ['xlnet']),
412 | # pad on the left for xlnet
413 | pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
414 | pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
415 | )
416 | if args.local_rank in [-1, 0]:
417 | logger.info("Saving features into cached file %s", cached_features_file)
418 | torch.save(features, cached_features_file)
419 |
420 | if args.local_rank == 0 and not evaluate:
421 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
422 |
423 | # Convert to Tensors and build dataset
424 | all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
425 | all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
426 | all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
427 | if output_mode == "classification":
428 | all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
429 | elif output_mode == "regression":
430 | all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
431 | elif output_mode == "MultiLabelclassification":
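    |         # For the multi-label task each f.label is a multi-hot vector (one slot per entry in
    |         # labels.txt), stored as float - presumably consumed by a sigmoid/BCE-style loss in
    |         # BertForMultiSequenceClassification (defined in modeling_bert1.py, not shown here).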
432 | all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
433 |
434 | dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
435 | return dataset
436 |
437 |
438 | def main():
439 | parser = argparse.ArgumentParser()
440 |
441 | ## Required parameters
442 | parser.add_argument("--data_dir", default="/home/msqin/bert/bert1/data_mutil", type=str,
443 | help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
444 | parser.add_argument("--model_type", default="bert", type=str,
445 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
446 | parser.add_argument("--model_name_or_path", default="/home/msqin/bert/bert1/uncased_L-12_H-768_A-12/pytorch_model.bin", type=str,
447 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(
448 | ALL_MODELS))
449 | parser.add_argument("--task_name", default="multilabel", type=str,
450 | help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
451 | parser.add_argument("--output_dir", default="/home/msqin/bert/bert1/tmp/new_output", type=str,
452 | help="The output directory where the model predictions and checkpoints will be written.")
453 |
454 | ## Other parameters
455 | parser.add_argument("--config_name", default="/home/msqin/bert/bert1/uncased_L-12_H-768_A-12/bert_config.json", type=str,
456 | help="Pretrained config name or path if not the same as model_name")
457 | parser.add_argument("--tokenizer_name", default="/home/msqin/bert/bert1/uncased_L-12_H-768_A-12/vocab.txt", type=str,
458 | help="Pretrained tokenizer name or path if not the same as model_name")
459 | parser.add_argument("--cache_dir", default="", type=str,
460 | help="Where do you want to store the pre-trained models downloaded from s3")
461 | parser.add_argument("--max_seq_length", default=32, type=int,
462 | help="The maximum total input sequence length after tokenization. Sequences longer "
463 | "than this will be truncated, sequences shorter will be padded.")
464 | parser.add_argument("--do_train", default=True,
465 | help="Whether to run training.")
466 | parser.add_argument("--do_eval", default=True,
467 | help="Whether to run eval on the dev set.")
468 | parser.add_argument("--evaluate_during_training", default=True,
469 |                         help="Run evaluation during training at each logging step.")
470 | parser.add_argument("--do_lower_case", default=True,
471 | help="Set this flag if you are using an uncased model.")
472 |
473 | parser.add_argument("--per_gpu_train_batch_size", default=32,type=int,
474 | help="Batch size per GPU/CPU for training.")
475 | parser.add_argument("--per_gpu_eval_batch_size", default=3, type=int,
476 | help="Batch size per GPU/CPU for evaluation.")
477 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
478 | help="Number of updates steps to accumulate before performing a backward/update pass.")
479 | parser.add_argument("--learning_rate", default=2e-5, type=float,
480 | help="The initial learning rate for Adam.")
481 | parser.add_argument("--weight_decay", default=0.0, type=float,
482 |                         help="Weight decay if we apply some.")
483 | parser.add_argument("--adam_epsilon", default=1e-8, type=float,
484 | help="Epsilon for Adam optimizer.")
485 | parser.add_argument("--max_grad_norm", default=1.0, type=float,
486 | help="Max gradient norm.")
487 | parser.add_argument("--num_train_epochs", default=20.0, type=float,
488 | help="Total number of training epochs to perform.")
489 | parser.add_argument("--max_steps", default=-1, type=int,
490 | help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
491 | parser.add_argument("--warmup_steps", default=0, type=int,
492 | help="Linear warmup over warmup_steps.")
493 |
494 | parser.add_argument('--logging_steps', type=int, default=500,
495 | help="Log every X updates steps.")
496 | parser.add_argument('--save_steps', type=int, default=500,
497 | help="Save checkpoint every X updates steps.")
498 | parser.add_argument("--eval_all_checkpoints", default=True,
499 |                         help="Evaluate all checkpoints starting with the same prefix as model_name and ending with the step number")
500 | parser.add_argument("--no_cuda", action='store_true',
501 | help="Avoid using CUDA when available")
502 | parser.add_argument('--overwrite_output_dir', action='store_true',
503 | help="Overwrite the content of the output directory")
504 | parser.add_argument('--overwrite_cache', action='store_true',
505 | help="Overwrite the cached training and evaluation sets")
506 | parser.add_argument('--seed', type=int, default=42,
507 | help="random seed for initialization")
508 |
509 | parser.add_argument('--tpu', action='store_true',
510 | help="Whether to run on the TPU defined in the environment variables")
511 | parser.add_argument('--tpu_ip_address', type=str, default='',
512 | help="TPU IP address if none are set in the environment variables")
513 | parser.add_argument('--tpu_name', type=str, default='',
514 | help="TPU name if none are set in the environment variables")
515 | parser.add_argument('--xrt_tpu_config', type=str, default='',
516 | help="XRT TPU config if none are set in the environment variables")
517 |
518 | parser.add_argument('--fp16', action='store_true',
519 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
520 | parser.add_argument('--fp16_opt_level', type=str, default='O1',
521 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
522 | "See details at https://nvidia.github.io/apex/amp.html")
523 | parser.add_argument("--local_rank", type=int, default=-1,
524 | help="For distributed training: local_rank")
525 | parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
526 | parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
527 | args = parser.parse_args()
528 |
529 | if os.path.exists(args.output_dir) and os.listdir(
530 | args.output_dir) and args.do_train and not args.overwrite_output_dir:
531 | raise ValueError(
532 | "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
533 | args.output_dir))
534 |
535 | # Setup distant debugging if needed
536 | if args.server_ip and args.server_port:
537 | # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
538 | import ptvsd
539 | print("Waiting for debugger attach")
540 | ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
541 | ptvsd.wait_for_attach()
542 |
543 | # Setup CUDA, GPU & distributed training
544 | if args.local_rank == -1 or args.no_cuda:
545 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
546 | args.n_gpu = torch.cuda.device_count()
547 |     else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
548 | torch.cuda.set_device(args.local_rank)
549 | device = torch.device("cuda", args.local_rank)
550 | torch.distributed.init_process_group(backend='nccl')
551 | args.n_gpu = 1
552 | args.device = device
553 |
554 | if args.tpu:
555 | if args.tpu_ip_address:
556 | os.environ["TPU_IP_ADDRESS"] = args.tpu_ip_address
557 | if args.tpu_name:
558 | os.environ["TPU_NAME"] = args.tpu_name
559 | if args.xrt_tpu_config:
560 | os.environ["XRT_TPU_CONFIG"] = args.xrt_tpu_config
561 |
562 | assert "TPU_IP_ADDRESS" in os.environ
563 | assert "TPU_NAME" in os.environ
564 | assert "XRT_TPU_CONFIG" in os.environ
565 |
566 | import torch_xla
567 | import torch_xla.core.xla_model as xm
568 | args.device = xm.xla_device()
569 | args.xla_model = xm
570 |
571 | # Setup logging
572 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
573 | datefmt='%m/%d/%Y %H:%M:%S',
574 | level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
575 | logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
576 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
577 |
578 | # Set seed
579 | set_seed(args)
580 |
581 | # Prepare GLUE task
582 | args.task_name = args.task_name.lower()
583 | if args.task_name not in processors:
584 | raise ValueError("Task not found: %s" % (args.task_name))
585 | processor = processors[args.task_name]()
586 | args.output_mode = output_modes[args.task_name]
587 | label_list = processor.get_labels()
588 | num_labels = len(label_list)
589 |
590 | # Load pretrained model and tokenizer
591 | if args.local_rank not in [-1, 0]:
592 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
593 |
594 | args.model_type = args.model_type.lower()
595 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
596 | config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
597 | num_labels=num_labels, finetuning_task=args.task_name)
598 | tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
599 | do_lower_case=args.do_lower_case)
600 | model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path),
601 | config=config)
602 |
603 | if args.local_rank == 0:
604 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
605 |
606 | model.to(args.device)
607 |
608 | logger.info("Training/evaluation parameters %s", args)
609 |
610 | # Training
611 | if args.do_train:
612 | train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
613 | global_step, tr_loss = train(args, train_dataset, model, tokenizer)
614 | logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
615 |
616 | # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
617 | if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0) and not args.tpu:
618 | # Create output directory if needed
619 | if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
620 | os.makedirs(args.output_dir)
621 |
622 | logger.info("Saving model checkpoint to %s", args.output_dir)
623 | # Save a trained model, configuration and tokenizer using `save_pretrained()`.
624 | # They can then be reloaded using `from_pretrained()`
625 | model_to_save = model.module if hasattr(model,
626 | 'module') else model # Take care of distributed/parallel training
627 | model_to_save.save_pretrained(args.output_dir)
628 | tokenizer.save_pretrained(args.output_dir)
629 |
630 | # Good practice: save your training arguments together with the trained model
631 | torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
632 |
633 | # Load a trained model and vocabulary that you have fine-tuned
634 | model = model_class.from_pretrained(args.output_dir)
635 | tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
636 | model.to(args.device)
637 |
638 | # Evaluation
639 | results = {}
640 | if args.do_eval and args.local_rank in [-1, 0]:
641 | tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
642 | checkpoints = [args.output_dir]
643 | if args.eval_all_checkpoints:
644 | checkpoints = list(
645 | os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
646 | logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
647 | logger.info("Evaluate the following checkpoints: %s", checkpoints)
648 | for checkpoint in checkpoints:
649 | # checkpoint="/home/msqin/bert/bert1/tmp/My_output"
650 | global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
651 | prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
652 |
653 | model = model_class.from_pretrained(checkpoint)
654 | model.to(args.device)
655 | result = evaluate(args, model, tokenizer, prefix=prefix)
656 | result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
657 | results.update(result)
658 | print(results)
659 | return results
660 |
661 |
662 | if __name__ == "__main__":
663 | main()
--------------------------------------------------------------------------------
/modeling_utils1.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """PyTorch BERT model."""
17 |
18 | from __future__ import (absolute_import, division, print_function,
19 | unicode_literals)
20 |
21 | import copy
22 | import json
23 | import logging
24 | import os
25 | from io import open
26 |
27 | import six
28 | import torch
29 | from torch import nn
30 | from torch.nn import CrossEntropyLoss
31 | from torch.nn import functional as F
32 |
33 | from configuration_utils1 import PretrainedConfig
34 | from file_utils1 import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
35 |
36 | logger = logging.getLogger(__name__)
37 |
38 |
39 | try:
40 | from torch.nn import Identity
41 | except ImportError:
42 | # Older PyTorch compatibility
43 | class Identity(nn.Module):
44 | r"""A placeholder identity operator that is argument-insensitive.
45 | """
46 | def __init__(self, *args, **kwargs):
47 | super(Identity, self).__init__()
48 |
49 | def forward(self, input):
50 | return input
51 |
52 | class PreTrainedModel(nn.Module):
53 | r""" Base class for all models.
54 |
55 | :class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
56 |     as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.
57 |
58 | Class attributes (overridden by derived classes):
59 | - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
60 |         - ``pretrained_model_archive_map``: a python ``dict`` with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values.
61 | - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments:
62 |
63 | - ``model``: an instance of the relevant subclass of :class:`~transformers.PreTrainedModel`,
64 | - ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`,
65 | - ``path``: a path (string) to the TensorFlow checkpoint.
66 |
67 | - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model.
68 | """
69 | config_class = None
70 | pretrained_model_archive_map = {}
71 | load_tf_weights = lambda model, config, path: None
72 | base_model_prefix = ""
73 |
74 | def __init__(self, config, *inputs, **kwargs):
75 | super(PreTrainedModel, self).__init__()
76 | if not isinstance(config, PretrainedConfig):
77 | raise ValueError(
78 | "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. "
79 | "To create a model from a pretrained model use "
80 | "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
81 | self.__class__.__name__, self.__class__.__name__
82 | ))
83 | # Save config in model
84 | self.config = config
85 |
86 | def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None):
87 | """ Build a resized Embedding Module from a provided token Embedding Module.
88 | Increasing the size will add newly initialized vectors at the end
89 | Reducing the size will remove vectors from the end
90 |
91 | Args:
92 | new_num_tokens: (`optional`) int
93 | New number of tokens in the embedding matrix.
94 | Increasing the size will add newly initialized vectors at the end
95 | Reducing the size will remove vectors from the end
96 | If not provided or None: return the provided token Embedding Module.
97 | Return: ``torch.nn.Embeddings``
98 | Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None
99 | """
100 | if new_num_tokens is None:
101 | return old_embeddings
102 |
103 | old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
104 | if old_num_tokens == new_num_tokens:
105 | return old_embeddings
106 |
107 | # Build new embeddings
108 | new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
109 | new_embeddings.to(old_embeddings.weight.device)
110 |
111 | # initialize all new embeddings (in particular added tokens)
112 | self._init_weights(new_embeddings)
113 |
114 | # Copy word embeddings from the previous weights
115 | num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
116 | new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]
117 |
118 | return new_embeddings
119 |
120 | def _tie_or_clone_weights(self, first_module, second_module):
121 |         """ Tie or clone module weights depending on whether we are using TorchScript or not
122 | """
123 | if self.config.torchscript:
124 | first_module.weight = nn.Parameter(second_module.weight.clone())
125 | else:
126 | first_module.weight = second_module.weight
127 |
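    |         # If the tied output module has a bias, zero-pad it so its length matches the first
    |         # dimension of the (possibly resized) weight matrix it now shares.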
128 | if hasattr(first_module, 'bias') and first_module.bias is not None:
129 | first_module.bias.data = torch.nn.functional.pad(
130 | first_module.bias.data,
131 | (0, first_module.weight.shape[0] - first_module.bias.shape[0]),
132 | 'constant',
133 | 0
134 | )
135 |
136 | def resize_token_embeddings(self, new_num_tokens=None):
137 | """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
138 | Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
139 |
140 | Arguments:
141 |
142 | new_num_tokens: (`optional`) int:
143 | New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end.
144 | If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embeddings`` Module of the model.
145 |
146 | Return: ``torch.nn.Embeddings``
147 | Pointer to the input tokens Embeddings Module of the model
148 | """
149 | base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed
150 | model_embeds = base_model._resize_token_embeddings(new_num_tokens)
151 | if new_num_tokens is None:
152 | return model_embeds
153 |
154 | # Update base model and current model config
155 | self.config.vocab_size = new_num_tokens
156 | base_model.vocab_size = new_num_tokens
157 |
158 | # Tie weights again if needed
159 | if hasattr(self, 'tie_weights'):
160 | self.tie_weights()
161 |
162 | return model_embeds
163 |
164 | def init_weights(self):
165 |         """ Initializes and prunes weights if needed. """
166 | # Initialize weights
167 | self.apply(self._init_weights)
168 |
169 | # Prune heads if needed
170 | if self.config.pruned_heads:
171 | self.prune_heads(self.config.pruned_heads)
172 |
173 | def prune_heads(self, heads_to_prune):
174 | """ Prunes heads of the base model.
175 |
176 | Arguments:
177 |
178 | heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`).
179 | E.g. {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2.
180 | """
181 | base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed
182 |
183 | # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads
184 | for layer, heads in heads_to_prune.items():
185 | union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads)
186 | self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON
187 |
188 | base_model._prune_heads(heads_to_prune)
189 |
190 | def save_pretrained(self, save_directory):
191 | """ Save a model and its configuration file to a directory, so that it
192 | can be re-loaded using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method.
193 | """
194 | assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
195 |
196 |         # Only save the model itself if we are using distributed training
197 | model_to_save = self.module if hasattr(self, 'module') else self
198 |
199 | # Save configuration file
200 | model_to_save.config.save_pretrained(save_directory)
201 |
202 | # If we save using the predefined names, we can load using `from_pretrained`
203 | output_model_file = os.path.join(save_directory, WEIGHTS_NAME)
204 | torch.save(model_to_save.state_dict(), output_model_file)
205 | logger.info("Model weights saved in {}".format(output_model_file))
206 |
207 | @classmethod
208 | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
209 | r"""Instantiate a pretrained pytorch model from a pre-trained model configuration.
210 |
211 | The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated)
212 | To train the model, you should first set it back in training mode with ``model.train()``
213 |
214 | The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model.
215 | It is up to you to train those weights with a downstream fine-tuning task.
216 |
217 | The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded.
218 |
219 | Parameters:
220 | pretrained_model_name_or_path: either:
221 |
222 | - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
223 | - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
224 | - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
225 | - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``)
226 |
227 | model_args: (`optional`) Sequence of positional arguments:
228 |                 All remaining positional arguments will be passed to the underlying model's ``__init__`` method
229 |
230 | config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
231 |                 Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
232 |
233 | - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
234 |                     - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
235 |                     - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
236 |
237 | state_dict: (`optional`) dict:
238 |                 an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
239 | This option can be used if you want to create a model from a pretrained configuration but load your own weights.
240 | In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
241 |
242 | cache_dir: (`optional`) string:
243 | Path to a directory in which a downloaded pre-trained model
244 | configuration should be cached if the standard cache should not be used.
245 |
246 | force_download: (`optional`) boolean, default False:
247 |                 Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
248 |
249 | proxies: (`optional`) dict, default None:
250 | A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
251 | The proxies are used on each request.
252 |
253 | output_loading_info: (`optional`) boolean:
254 |                 Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.
255 |
256 | kwargs: (`optional`) Remaining dictionary of keyword arguments:
257 |                 Can be used to update the configuration object (after it has been loaded) and to initialize the model (e.g. ``output_attention=True``). These arguments behave differently depending on whether a `config` is provided or automatically loaded:
258 |
259 | - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
260 | - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
261 |
262 | Examples::
263 |
264 | model = BertModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache.
265 | model = BertModel.from_pretrained('./test/saved_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
266 | model = BertModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading
267 | assert model.config.output_attention == True
268 | # Loading from a TF checkpoint file instead of a PyTorch model (slower)
269 | config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
270 | model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
271 |
272 | """
273 | config = kwargs.pop('config', None)
274 | state_dict = kwargs.pop('state_dict', None)
275 | cache_dir = kwargs.pop('cache_dir', None)
276 | from_tf = kwargs.pop('from_tf', False)
277 | force_download = kwargs.pop('force_download', False)
278 | proxies = kwargs.pop('proxies', None)
279 | output_loading_info = kwargs.pop('output_loading_info', False)
280 |
281 | # Load config
282 | if config is None:
283 | config, model_kwargs = cls.config_class.from_pretrained(
284 | pretrained_model_name_or_path, *model_args,
285 | cache_dir=cache_dir, return_unused_kwargs=True,
286 | force_download=force_download,
287 | **kwargs
288 | )
289 | else:
290 | model_kwargs = kwargs
291 |
292 | # Load model
293 | if pretrained_model_name_or_path is not None:
294 | if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
295 | archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
296 | elif os.path.isdir(pretrained_model_name_or_path):
297 | if from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")):
298 | # Load from a TF 1.0 checkpoint
299 | archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")
300 | elif from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)):
301 | # Load from a TF 2.0 checkpoint
302 | archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)
303 | elif os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)):
304 | # Load from a PyTorch checkpoint
305 | archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
306 | else:
307 | raise EnvironmentError("No file named {} found in directory {} (set `from_tf=True` if the directory contains a TensorFlow checkpoint).".format(
308 | [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"],
309 | pretrained_model_name_or_path))
310 | elif os.path.isfile(pretrained_model_name_or_path):
311 | archive_file = pretrained_model_name_or_path
312 | else:
313 | assert from_tf, "Error finding file {}, no file or TF 1.X checkpoint found".format(pretrained_model_name_or_path)
314 | archive_file = pretrained_model_name_or_path + ".index"
315 |
316 | # redirect to the cache, if necessary
317 | try:
318 | resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
319 | except EnvironmentError:
320 | if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
321 | msg = "Couldn't reach server at '{}' to download pretrained weights.".format(
322 | archive_file)
323 | else:
324 | msg = "Model name '{}' was not found in model name list ({}). " \
325 | "We assumed '{}' was a path or url to model weight files named one of {} but " \
326 | "couldn't find any such file at this path or url.".format(
327 | pretrained_model_name_or_path,
328 | ', '.join(cls.pretrained_model_archive_map.keys()),
329 | archive_file,
330 | [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME])
331 | raise EnvironmentError(msg)
332 |
333 | if resolved_archive_file == archive_file:
334 | logger.info("loading weights file {}".format(archive_file))
335 | else:
336 | logger.info("loading weights file {} from cache at {}".format(
337 | archive_file, resolved_archive_file))
338 | else:
339 | resolved_archive_file = None
340 |
341 | # Instantiate model.
342 | model = cls(config, *model_args, **model_kwargs)
343 |
344 | if state_dict is None and not from_tf:
345 | state_dict = torch.load(resolved_archive_file, map_location='cpu')
346 |
347 | missing_keys = []
348 | unexpected_keys = []
349 | error_msgs = []
350 |
351 | if from_tf:
352 | if resolved_archive_file.endswith('.index'):
353 | # Load from a TensorFlow 1.X checkpoint - provided by original authors
354 | model = cls.load_tf_weights(model, config, resolved_archive_file[:-6]) # Remove the '.index'
355 | else:
356 | # Load from our TensorFlow 2.0 checkpoints
357 | try:
358 | from transformers import load_tf2_checkpoint_in_pytorch_model
359 | model = load_tf2_checkpoint_in_pytorch_model(model, resolved_archive_file, allow_missing_keys=True)
360 | except ImportError as e:
361 | logger.error("Loading a TensorFlow model in PyTorch requires both PyTorch and TensorFlow to be installed. Please see "
362 | "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
363 | raise e
364 | else:
365 | # Convert old format to new format if needed from a PyTorch state_dict
366 | old_keys = []
367 | new_keys = []
368 | for key in state_dict.keys():
369 | new_key = None
370 | if 'gamma' in key:
371 | new_key = key.replace('gamma', 'weight')
372 | if 'beta' in key:
373 | new_key = key.replace('beta', 'bias')
374 | if new_key:
375 | old_keys.append(key)
376 | new_keys.append(new_key)
377 | for old_key, new_key in zip(old_keys, new_keys):
378 | state_dict[new_key] = state_dict.pop(old_key)
379 |
380 | # copy state_dict so _load_from_state_dict can modify it
381 | metadata = getattr(state_dict, '_metadata', None)
382 | state_dict = state_dict.copy()
383 | if metadata is not None:
384 | state_dict._metadata = metadata
385 |
386 | def load(module, prefix=''):
387 | local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
388 | module._load_from_state_dict(
389 | state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
390 | for name, child in module._modules.items():
391 | if child is not None:
392 | load(child, prefix + name + '.')
393 |
394 | # Make sure we are able to load base models as well as derived models (with heads)
395 | start_prefix = ''
396 | model_to_load = model
397 | if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
398 | start_prefix = cls.base_model_prefix + '.'
399 | if hasattr(model, cls.base_model_prefix) and not any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
400 | model_to_load = getattr(model, cls.base_model_prefix)
401 |
402 | load(model_to_load, prefix=start_prefix)
403 | if len(missing_keys) > 0:
404 | logger.info("Weights of {} not initialized from pretrained model: {}".format(
405 | model.__class__.__name__, missing_keys))
406 | if len(unexpected_keys) > 0:
407 | logger.info("Weights from pretrained model not used in {}: {}".format(
408 | model.__class__.__name__, unexpected_keys))
409 | if len(error_msgs) > 0:
410 | raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
411 | model.__class__.__name__, "\n\t".join(error_msgs)))
412 |
413 | if hasattr(model, 'tie_weights'):
414 | model.tie_weights() # make sure word embedding weights are still tied
415 |
416 | # Set model in evaluation mode to deactivate dropout modules by default
417 | model.eval()
418 |
419 | if output_loading_info:
420 | loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys, "error_msgs": error_msgs}
421 | return model, loading_info
422 |
423 | return model
424 |
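# --- Illustrative sketch (not part of the original file) --------------------------------
# Shows the `output_loading_info` flag handled in `from_pretrained` above: alongside the
# model it returns a dict of checkpoint keys that could not be matched. `BertModel` from
# the installed `transformers` package is used only as an example (any PreTrainedModel
# subclass behaves the same); which keys actually appear depends on the checkpoint, and
# the lists may well be empty.
def _demo_output_loading_info():
    from transformers import BertModel
    model, loading_info = BertModel.from_pretrained('bert-base-uncased',
                                                    output_loading_info=True)
    print('missing keys:   ', loading_info['missing_keys'])
    print('unexpected keys:', loading_info['unexpected_keys'])
    print('error messages: ', loading_info['error_msgs'])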
425 |
426 | class Conv1D(nn.Module):
427 | def __init__(self, nf, nx):
428 | """ Conv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
429 | Basically works like a Linear layer but the weights are transposed
430 | """
431 | super(Conv1D, self).__init__()
432 | self.nf = nf
433 | w = torch.empty(nx, nf)
434 | nn.init.normal_(w, std=0.02)
435 | self.weight = nn.Parameter(w)
436 | self.bias = nn.Parameter(torch.zeros(nf))
437 |
438 | def forward(self, x):
439 | size_out = x.size()[:-1] + (self.nf,)
440 | x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
441 | x = x.view(*size_out)
442 | return x
443 |
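# --- Illustrative sketch (not part of the original file) --------------------------------
# Conv1D above behaves like nn.Linear with its weight stored transposed: an input of shape
# (batch, seq_len, nx) comes out as (batch, seq_len, nf). Copying the transposed weight
# into a Linear layer reproduces the output; the sizes below are arbitrary demo values.
def _demo_conv1d_matches_linear():
    nx, nf = 8, 4
    conv = Conv1D(nf, nx)
    linear = nn.Linear(nx, nf)
    with torch.no_grad():
        linear.weight.copy_(conv.weight.t())  # Conv1D stores its weight as (nx, nf)
        linear.bias.copy_(conv.bias)
    x = torch.randn(2, 5, nx)
    assert conv(x).shape == (2, 5, nf)
    assert torch.allclose(conv(x), linear(x), atol=1e-6)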
444 |
445 | class PoolerStartLogits(nn.Module):
446 | """ Compute SQuAD start_logits from sequence hidden states. """
447 | def __init__(self, config):
448 | super(PoolerStartLogits, self).__init__()
449 | self.dense = nn.Linear(config.hidden_size, 1)
450 |
451 | def forward(self, hidden_states, p_mask=None):
452 | """ Args:
453 | **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)``
454 | Mask of invalid positions such as query and special symbols (PAD, SEP, CLS).
455 | 1.0 means the token should be masked.
456 | """
457 | x = self.dense(hidden_states).squeeze(-1)
458 |
459 | if p_mask is not None:
460 | if next(self.parameters()).dtype == torch.float16:
461 | x = x * (1 - p_mask) - 65500 * p_mask
462 | else:
463 | x = x * (1 - p_mask) - 1e30 * p_mask
464 |
465 | return x
466 |
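# --- Illustrative sketch (not part of the original file) --------------------------------
# PoolerStartLogits maps each position's hidden state to one logit and pushes masked
# positions (p_mask == 1.0) towards -inf so that a downstream softmax ignores them. The
# tiny config is an assumption made for the demo; only `hidden_size` is read here.
def _demo_start_logits_masking():
    import types
    config = types.SimpleNamespace(hidden_size=16)
    pooler = PoolerStartLogits(config)
    hidden_states = torch.randn(2, 6, 16)          # (batch, seq_len, hidden)
    p_mask = torch.zeros(2, 6)
    p_mask[:, 0] = 1.0                              # e.g. mask the [CLS] position
    logits = pooler(hidden_states, p_mask=p_mask)   # shape (2, 6)
    assert (logits[:, 0] < -1e29).all()             # masked positions are ~ -1e30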
467 |
468 | class PoolerEndLogits(nn.Module):
469 | """ Compute SQuAD end_logits from sequence hidden states and start token hidden state.
470 | """
471 | def __init__(self, config):
472 | super(PoolerEndLogits, self).__init__()
473 | self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
474 | self.activation = nn.Tanh()
475 | self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
476 | self.dense_1 = nn.Linear(config.hidden_size, 1)
477 |
478 | def forward(self, hidden_states, start_states=None, start_positions=None, p_mask=None):
479 | """ Args:
480 | One of ``start_states`` or ``start_positions`` should not be None.
481 | If both are set, ``start_positions`` overrides ``start_states``.
482 |
483 | **start_states**: ``torch.FloatTensor`` of shape identical to ``hidden_states``
484 | Hidden states of the first tokens for the labeled span.
485 | **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
486 | Position of the first token for the labeled span.
487 | **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)``
488 | Mask of invalid positions such as query and special symbols (PAD, SEP, CLS).
489 | 1.0 means the token should be masked.
490 | """
491 | assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None"
492 | if start_positions is not None:
493 | slen, hsz = hidden_states.shape[-2:]
494 | start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
495 | start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz)
496 | start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz)
497 |
498 | x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1))
499 | x = self.activation(x)
500 | x = self.LayerNorm(x)
501 | x = self.dense_1(x).squeeze(-1)
502 |
503 | if p_mask is not None:
504 | if next(self.parameters()).dtype == torch.float16:
505 | x = x * (1 - p_mask) - 65500 * p_mask
506 | else:
507 | x = x * (1 - p_mask) - 1e30 * p_mask
508 |
509 | return x
510 |
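# --- Illustrative sketch (not part of the original file) --------------------------------
# PoolerEndLogits accepts either gold start positions or pre-gathered start states. The
# gather/expand at the top of `forward` turns `(batch,)` positions into a
# `(batch, seq_len, hidden)` tensor, so the two call styles below agree. The config values
# and tensor sizes are assumptions made for the demo.
def _demo_end_logits_start_positions():
    import types
    config = types.SimpleNamespace(hidden_size=16, layer_norm_eps=1e-12)
    pooler = PoolerEndLogits(config)
    hidden_states = torch.randn(2, 6, 16)
    start_positions = torch.tensor([1, 4])
    # Gather the start hidden states by hand and broadcast them over the sequence length.
    idx = start_positions[:, None, None].expand(-1, -1, 16)
    start_states = hidden_states.gather(-2, idx).expand(-1, 6, -1)
    out_from_positions = pooler(hidden_states, start_positions=start_positions)
    out_from_states = pooler(hidden_states, start_states=start_states)
    assert torch.allclose(out_from_positions, out_from_states)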
511 |
512 | class PoolerAnswerClass(nn.Module):
513 | """ Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """
514 | def __init__(self, config):
515 | super(PoolerAnswerClass, self).__init__()
516 | self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
517 | self.activation = nn.Tanh()
518 | self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False)
519 |
520 | def forward(self, hidden_states, start_states=None, start_positions=None, cls_index=None):
521 | """
522 | Args:
523 | One of ``start_states`` or ``start_positions`` should not be None.
524 | If both are set, ``start_positions`` overrides ``start_states``.
525 |
526 | **start_states**: ``torch.FloatTensor`` of shape identical to ``hidden_states``
527 | Hidden states of the first tokens for the labeled span.
528 | **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
529 | Position of the first token for the labeled span.
530 | **cls_index**: ``torch.LongTensor`` of shape ``(batch_size,)``
531 | Position of the CLS token. If ``None``, the last token is used.
532 |
533 | note(Original repo):
534 | no dependency on end_feature so that we can obtain one single `cls_logits`
535 | for each sample
536 | """
537 | hsz = hidden_states.shape[-1]
538 | assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None"
539 | if start_positions is not None:
540 | start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
541 | start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz)
542 |
543 | if cls_index is not None:
544 | cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
545 | cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz)
546 | else:
547 | cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz)
548 |
549 | x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1))
550 | x = self.activation(x)
551 | x = self.dense_1(x).squeeze(-1)
552 |
553 | return x
554 |
555 |
556 | class SQuADHead(nn.Module):
557 | r""" A SQuAD head inspired by XLNet.
558 |
559 | Parameters:
560 | config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
561 |
562 | Inputs:
563 | **hidden_states**: ``torch.FloatTensor`` of shape ``(batch_size, seq_len, hidden_size)``
564 | hidden states of sequence tokens
565 | **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
566 | position of the first token for the labeled span.
567 | **end_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
568 | position of the last token for the labeled span.
569 | **cls_index**: ``torch.LongTensor`` of shape ``(batch_size,)``
570 | Position of the CLS token. If ``None``, the last token is used.
571 | **is_impossible**: ``torch.LongTensor`` of shape ``(batch_size,)``
572 | Whether the question has a possible answer in the paragraph or not.
573 | **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)``
574 | Mask of invalid positions such as query and special symbols (PAD, SEP, CLS).
575 | 1.0 means the token should be masked.
576 |
577 | Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
578 | **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
579 | Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses.
580 | **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
581 | ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``
582 | Log probabilities for the top config.start_n_top start token possibilities (beam-search).
583 | **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
584 | ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``
585 | Indices for the top config.start_n_top start token possibilities (beam-search).
586 | **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
587 | ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
588 | Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
589 | **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
590 | ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
591 | Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
592 | **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
593 | ``torch.FloatTensor`` of shape ``(batch_size,)``
594 | Logits for the ``is_impossible`` label of the answers.
595 | """
596 | def __init__(self, config):
597 | super(SQuADHead, self).__init__()
598 | self.start_n_top = config.start_n_top
599 | self.end_n_top = config.end_n_top
600 |
601 | self.start_logits = PoolerStartLogits(config)
602 | self.end_logits = PoolerEndLogits(config)
603 | self.answer_class = PoolerAnswerClass(config)
604 |
605 | def forward(self, hidden_states, start_positions=None, end_positions=None,
606 | cls_index=None, is_impossible=None, p_mask=None):
607 | outputs = ()
608 |
609 | start_logits = self.start_logits(hidden_states, p_mask=p_mask)
610 |
611 | if start_positions is not None and end_positions is not None:
612 | # If we are on multi-GPU, let's remove the dimension added by batch splitting
613 | for x in (start_positions, end_positions, cls_index, is_impossible):
614 | if x is not None and x.dim() > 1:
615 | x.squeeze_(-1)
616 |
617 | # during training, compute the end logits based on the ground truth of the start position
618 | end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask)
619 |
620 | loss_fct = CrossEntropyLoss()
621 | start_loss = loss_fct(start_logits, start_positions)
622 | end_loss = loss_fct(end_logits, end_positions)
623 | total_loss = (start_loss + end_loss) / 2
624 |
625 | if cls_index is not None and is_impossible is not None:
626 | # Predict answerability from the representation of CLS and START
627 | cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index)
628 | loss_fct_cls = nn.BCEWithLogitsLoss()
629 | cls_loss = loss_fct_cls(cls_logits, is_impossible)
630 |
631 | # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
632 | total_loss += cls_loss * 0.5
633 |
634 | outputs = (total_loss,) + outputs
635 |
636 | else:
637 | # during inference, compute the end logits based on beam search
638 | bsz, slen, hsz = hidden_states.size()
639 | start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen)
640 |
641 | start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top)
642 | start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz)
643 | start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz)
644 | start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz)
645 |
646 | hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz)
647 | p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
648 | end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
649 | end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top)
650 |
651 | end_top_log_probs, end_top_index = torch.topk(end_log_probs, self.end_n_top, dim=1) # shape (bsz, end_n_top, start_n_top)
652 | end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
653 | end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
654 |
655 | start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
656 | cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)
657 |
658 | outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs
659 |
660 | # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits
661 | # or (if labels are provided) (total_loss,)
662 | return outputs
663 |
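# --- Illustrative sketch (not part of the original file) --------------------------------
# SQuADHead returns a single loss when gold start/end positions are given, and the
# beam-search tensors otherwise. The bare-bones config is an assumption for the demo; it
# only carries the attributes read by the head and its three pooler modules.
def _demo_squad_head():
    import types
    config = types.SimpleNamespace(hidden_size=16, layer_norm_eps=1e-12,
                                   start_n_top=2, end_n_top=2)
    head = SQuADHead(config)
    hidden_states = torch.randn(2, 6, 16)

    # Training: gold spans in, (total_loss,) out.
    loss, = head(hidden_states,
                 start_positions=torch.tensor([1, 2]),
                 end_positions=torch.tensor([3, 5]))

    # Inference: no labels, top start/end candidates plus the answer-class logit out.
    start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = \
        head(hidden_states)
    assert start_top_index.shape == (2, config.start_n_top)
    assert end_top_index.shape == (2, config.start_n_top * config.end_n_top)
    assert cls_logits.shape == (2,)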
664 |
665 | class SequenceSummary(nn.Module):
666 | r""" Compute a single vector summary of a sequence hidden states according to various possibilities:
667 | Args of the config class:
668 | summary_type:
669 | - 'last' => [default] take the last token hidden state (like XLNet)
670 | - 'first' => take the first token hidden state (like Bert)
671 | - 'mean' => take the mean of all token hidden states
672 | - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
673 | - 'attn' => not implemented for now; would use multi-head attention
674 | summary_use_proj: Add a projection after the vector extraction
675 | summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
676 | summary_activation: 'tanh' => add a tanh activation to the output; anything else => no activation (default)
677 | summary_first_dropout: Add a dropout before the projection and activation
678 | summary_last_dropout: Add a dropout after the projection and activation
679 | """
680 | def __init__(self, config):
681 | super(SequenceSummary, self).__init__()
682 |
683 | self.summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last'
684 | if self.summary_type == 'attn':
685 | # We should use a standard multi-head attention module with absolute positional embedding for that.
686 | # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
687 | # We can probably just use the multi-head attention module of PyTorch >=1.1.0
688 | raise NotImplementedError
689 |
690 | self.summary = Identity()
691 | if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
692 | if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0:
693 | num_classes = config.num_labels
694 | else:
695 | num_classes = config.hidden_size
696 | self.summary = nn.Linear(config.hidden_size, num_classes)
697 |
698 | self.activation = Identity()
699 | if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
700 | self.activation = nn.Tanh()
701 |
702 | self.first_dropout = Identity()
703 | if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0:
704 | self.first_dropout = nn.Dropout(config.summary_first_dropout)
705 |
706 | self.last_dropout = Identity()
707 | if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
708 | self.last_dropout = nn.Dropout(config.summary_last_dropout)
709 |
710 | def forward(self, hidden_states, cls_index=None):
711 | """ hidden_states: float Tensor in shape [bsz, ..., seq_len, hidden_size], the hidden-states of the last layer.
712 | cls_index: [optional] position of the classification token if summary_type == 'cls_index',
713 | shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
714 | If summary_type == 'cls_index' and cls_index is None,
715 | the last token of the sequence is used as the classification token.
716 | """
717 | if self.summary_type == 'last':
718 | output = hidden_states[:, -1]
719 | elif self.summary_type == 'first':
720 | output = hidden_states[:, 0]
721 | elif self.summary_type == 'mean':
722 | output = hidden_states.mean(dim=1)
723 | elif self.summary_type == 'cls_index':
724 | if cls_index is None:
725 | cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2]-1, dtype=torch.long)
726 | else:
727 | cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
728 | cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),))
729 | # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
730 | output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size)
731 | elif self.summary_type == 'attn':
732 | raise NotImplementedError
733 |
734 | output = self.first_dropout(output)
735 | output = self.summary(output)
736 | output = self.activation(output)
737 | output = self.last_dropout(output)
738 |
739 | return output
740 |
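# --- Illustrative sketch (not part of the original file) --------------------------------
# SequenceSummary picks one vector per sequence according to `summary_type` and optionally
# projects it. The config below is an assumption made for the demo: 'first' mimics BERT's
# [CLS] pooling and the projection maps the hidden state onto `num_labels` outputs.
def _demo_sequence_summary():
    import types
    config = types.SimpleNamespace(summary_type='first',
                                   summary_use_proj=True,
                                   summary_proj_to_labels=True,
                                   num_labels=16,
                                   hidden_size=32,
                                   summary_activation='tanh',
                                   summary_first_dropout=0.1,
                                   summary_last_dropout=0.0)
    summary = SequenceSummary(config)
    hidden_states = torch.randn(2, 6, 32)   # (batch, seq_len, hidden)
    pooled = summary(hidden_states)         # first-token vector -> dropout -> projection -> tanh
    assert pooled.shape == (2, 16)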
741 |
742 | def prune_linear_layer(layer, index, dim=0):
743 | """ Prune a linear layer (a model parameters) to keep only entries in index.
744 | Return the pruned layer as a new layer with requires_grad=True.
745 | Used to remove heads.
746 | """
747 | index = index.to(layer.weight.device)
748 | W = layer.weight.index_select(dim, index).clone().detach()
749 | if layer.bias is not None:
750 | if dim == 1:
751 | b = layer.bias.clone().detach()
752 | else:
753 | b = layer.bias[index].clone().detach()
754 | new_size = list(layer.weight.size())
755 | new_size[dim] = len(index)
756 | new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device)
757 | new_layer.weight.requires_grad = False
758 | new_layer.weight.copy_(W.contiguous())
759 | new_layer.weight.requires_grad = True
760 | if layer.bias is not None:
761 | new_layer.bias.requires_grad = False
762 | new_layer.bias.copy_(b.contiguous())
763 | new_layer.bias.requires_grad = True
764 | return new_layer
765 |
766 |
767 | def prune_conv1d_layer(layer, index, dim=1):
768 | """ Prune a Conv1D layer (a model parameters) to keep only entries in index.
769 | A Conv1D work as a Linear layer (see e.g. BERT) but the weights are transposed.
770 | Return the pruned layer as a new layer with requires_grad=True.
771 | Used to remove heads.
772 | """
773 | index = index.to(layer.weight.device)
774 | W = layer.weight.index_select(dim, index).clone().detach()
775 | if dim == 0:
776 | b = layer.bias.clone().detach()
777 | else:
778 | b = layer.bias[index].clone().detach()
779 | new_size = list(layer.weight.size())
780 | new_size[dim] = len(index)
781 | new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device)
782 | new_layer.weight.requires_grad = False
783 | new_layer.weight.copy_(W.contiguous())
784 | new_layer.weight.requires_grad = True
785 | new_layer.bias.requires_grad = False
786 | new_layer.bias.copy_(b.contiguous())
787 | new_layer.bias.requires_grad = True
788 | return new_layer
789 |
790 |
791 | def prune_layer(layer, index, dim=None):
792 | """ Prune a Conv1D or nn.Linear layer (a model parameters) to keep only entries in index.
793 | Return the pruned layer as a new layer with requires_grad=True.
794 | Used to remove heads.
795 | """
796 | if isinstance(layer, nn.Linear):
797 | return prune_linear_layer(layer, index, dim=0 if dim is None else dim)
798 | elif isinstance(layer, Conv1D):
799 | return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim)
800 | else:
801 | raise ValueError("Can't prune layer of class {}".format(layer.__class__))
802 |
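# --- Illustrative sketch (not part of the original file) --------------------------------
# prune_layer keeps only the rows/columns listed in `index`. For an nn.Linear the default
# dim=0 prunes output features, so keeping 2 of 4 outputs yields a Linear(8, 2) that
# reproduces the corresponding slice of the original output. Sizes are demo assumptions.
def _demo_prune_layer():
    layer = nn.Linear(8, 4)
    index = torch.tensor([0, 2])            # keep output features 0 and 2
    pruned = prune_layer(layer, index)      # dispatches to prune_linear_layer with dim=0
    x = torch.randn(3, 8)
    assert pruned(x).shape == (3, 2)
    assert torch.allclose(pruned(x), layer(x)[:, index])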
--------------------------------------------------------------------------------